Message ID | 20181002161915.18843-6-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | LSE atomics out-of-line | expand |
On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote: > When the result of an operation is not used, we can ignore the > result by storing to XZR. For two of the memory models, using > XZR with LD<op> has a preferred assembler alias, ST<op>. ST<op> has different semantics to LD<op>, in particular, ST<op> is not ordered by a DMB LD; so this could weaken the LDADD and break C11 semantics. The relevant Arm Arm text is: If the destination register is not one of WZR or XZR, LDADDA and LDADDAL load from memory with acquire semantics LDADDL and LDADDAL store to memory with release semantics. LDADD has no memory ordering requirements. I'm taking this to mean that even if the result is unused, using XZR is not a valid transformation; it weakens the expected acquire semantics to unordered. The example I have from Will Deacon on an internal bug database is: P0 (atomic_int* y,atomic_int* x) { atomic_store_explicit(x,1,memory_order_relaxed); atomic_thread_fence(memory_order_release); atomic_store_explicit(y,1,memory_order_relaxed); } P1 (atomic_int* y,atomic_int* x) { int r0 = atomic_fetch_add_explicit(y,1,memory_order_relaxed); atomic_thread_fence(memory_order_acquire); int r1 = atomic_load_explicit(x,memory_order_relaxed); } The outcome where y == 2 and P1 has r0 = 1 and r1 = 0 is illegal. This example comes from a while back in my memory; so copying Will for any more detailed questions. My impression is that this transformation is not safe, and so the patch is not OK. Thanks, James > > * config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse): > Use ST<op> for relaxed and release models; load to XZR otherwise; > remove the now unnecessary scratch register. > > * gcc.target/aarch64/atomic-inst-ldadd.c: Expect stadd{,l}. > * gcc.target/aarch64/atomic-inst-ldlogic.c: Similarly. > --- > .../gcc.target/aarch64/atomic-inst-ldadd.c | 18 ++++--- > .../gcc.target/aarch64/atomic-inst-ldlogic.c | 54 ++++++++++++------- > gcc/config/aarch64/atomics.md | 15 +++--- > 3 files changed, 57 insertions(+), 30 deletions(-) > > diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c > index 4b2282c6861..db2206186b4 100644 > --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c > +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c > @@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN) > TEST (sub_load, SUB_LOAD) > TEST (sub_load_notreturn, SUB_LOAD_NORETURN) > > -/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */ > +/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */ > /* { dg-final { scan-assembler-times "ldaddab\t" 32} } */ > -/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */ > +/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */ > /* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */ > +/* { dg-final { scan-assembler-times "staddb\t" 8} } */ > +/* { dg-final { scan-assembler-times "staddlb\t" 8} } */ > > -/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */ > +/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */ > /* { dg-final { scan-assembler-times "ldaddah\t" 32} } */ > -/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */ > +/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */ > /* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */ > +/* { dg-final { scan-assembler-times "staddh\t" 8} } */ > +/* { dg-final { scan-assembler-times "staddlh\t" 8} } */ > > -/* { dg-final { scan-assembler-times "ldadd\t" 32} } */ > +/* { dg-final { scan-assembler-times "ldadd\t" 16} } */ > /* { dg-final { scan-assembler-times "ldadda\t" 64} } */ > -/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */ > +/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */ > /* { dg-final { scan-assembler-times "ldaddal\t" 64} } */ > +/* { dg-final { scan-assembler-times "stadd\t" 16} } */ > +/* { dg-final { scan-assembler-times "staddl\t" 16} } */ > > /* { dg-final { scan-assembler-not "ldaxr\t" } } */ > /* { dg-final { scan-assembler-not "stlxr\t" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c > index 4879d52b9b4..b8a53e0a676 100644 > --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c > +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c > @@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN) > > /* Load-OR. */ > > -/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */ > /* { dg-final { scan-assembler-times "ldsetab\t" 16} } */ > -/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */ > /* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */ > +/* { dg-final { scan-assembler-times "stsetb\t" 4} } */ > +/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */ > > -/* { dg-final { scan-assembler-times "ldseth\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldseth\t" 4} } */ > /* { dg-final { scan-assembler-times "ldsetah\t" 16} } */ > -/* { dg-final { scan-assembler-times "ldsetlh\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldsetlh\t" 4} } */ > /* { dg-final { scan-assembler-times "ldsetalh\t" 16} } */ > +/* { dg-final { scan-assembler-times "stseth\t" 4} } */ > +/* { dg-final { scan-assembler-times "stsetlh\t" 4} } */ > > -/* { dg-final { scan-assembler-times "ldset\t" 16} } */ > +/* { dg-final { scan-assembler-times "ldset\t" 8} } */ > /* { dg-final { scan-assembler-times "ldseta\t" 32} } */ > -/* { dg-final { scan-assembler-times "ldsetl\t" 16} } */ > +/* { dg-final { scan-assembler-times "ldsetl\t" 8} } */ > /* { dg-final { scan-assembler-times "ldsetal\t" 32} } */ > +/* { dg-final { scan-assembler-times "stset\t" 8} } */ > +/* { dg-final { scan-assembler-times "stsetl\t" 8} } */ > > /* Load-AND. */ > > -/* { dg-final { scan-assembler-times "ldclrb\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldclrb\t" 4} } */ > /* { dg-final { scan-assembler-times "ldclrab\t" 16} } */ > -/* { dg-final { scan-assembler-times "ldclrlb\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldclrlb\t" 4} } */ > /* { dg-final { scan-assembler-times "ldclralb\t" 16} } */ > +/* { dg-final { scan-assembler-times "stclrb\t" 4} } */ > +/* { dg-final { scan-assembler-times "stclrlb\t" 4} } */ > > -/* { dg-final { scan-assembler-times "ldclrh\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldclrh\t" 4} } */ > /* { dg-final { scan-assembler-times "ldclrah\t" 16} } */ > -/* { dg-final { scan-assembler-times "ldclrlh\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldclrlh\t" 4} } */ > /* { dg-final { scan-assembler-times "ldclralh\t" 16} } */ > +/* { dg-final { scan-assembler-times "stclrh\t" 4} } */ > +/* { dg-final { scan-assembler-times "stclrlh\t" 4} } */ > > -/* { dg-final { scan-assembler-times "ldclr\t" 16} */ > +/* { dg-final { scan-assembler-times "ldclr\t" 8} */ > /* { dg-final { scan-assembler-times "ldclra\t" 32} } */ > -/* { dg-final { scan-assembler-times "ldclrl\t" 16} } */ > +/* { dg-final { scan-assembler-times "ldclrl\t" 8} } */ > /* { dg-final { scan-assembler-times "ldclral\t" 32} } */ > +/* { dg-final { scan-assembler-times "stclr\t" 8} */ > +/* { dg-final { scan-assembler-times "stclrl\t" 8} } */ > > /* Load-XOR. */ > > -/* { dg-final { scan-assembler-times "ldeorb\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldeorb\t" 4} } */ > /* { dg-final { scan-assembler-times "ldeorab\t" 16} } */ > -/* { dg-final { scan-assembler-times "ldeorlb\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldeorlb\t" 4} } */ > /* { dg-final { scan-assembler-times "ldeoralb\t" 16} } */ > +/* { dg-final { scan-assembler-times "steorb\t" 4} } */ > +/* { dg-final { scan-assembler-times "steorlb\t" 4} } */ > > -/* { dg-final { scan-assembler-times "ldeorh\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldeorh\t" 4} } */ > /* { dg-final { scan-assembler-times "ldeorah\t" 16} } */ > -/* { dg-final { scan-assembler-times "ldeorlh\t" 8} } */ > +/* { dg-final { scan-assembler-times "ldeorlh\t" 4} } */ > /* { dg-final { scan-assembler-times "ldeoralh\t" 16} } */ > +/* { dg-final { scan-assembler-times "steorh\t" 4} } */ > +/* { dg-final { scan-assembler-times "steorlh\t" 4} } */ > > -/* { dg-final { scan-assembler-times "ldeor\t" 16} */ > +/* { dg-final { scan-assembler-times "ldeor\t" 8} */ > /* { dg-final { scan-assembler-times "ldeora\t" 32} } */ > -/* { dg-final { scan-assembler-times "ldeorl\t" 16} } */ > +/* { dg-final { scan-assembler-times "ldeorl\t" 8} } */ > /* { dg-final { scan-assembler-times "ldeoral\t" 32} } */ > +/* { dg-final { scan-assembler-times "steor\t" 8} */ > +/* { dg-final { scan-assembler-times "steorl\t" 8} } */ > > /* { dg-final { scan-assembler-not "ldaxr\t" } } */ > /* { dg-final { scan-assembler-not "stlxr\t" } } */ > diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md > index 2198649b1be..ee662a4480e 100644 > --- a/gcc/config/aarch64/atomics.md > +++ b/gcc/config/aarch64/atomics.md > @@ -269,19 +269,22 @@ > [(match_dup 0) > (match_operand:ALLI 1 "register_operand" "r") > (match_operand:SI 2 "const_int_operand")] > - ATOMIC_LDOP)) > - (clobber (match_scratch:ALLI 3 "=&r"))] > + ATOMIC_LDOP))] > "TARGET_LSE" > { > enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > + /* Note that while ST<op> is an alias for LD<op> with the second > + operand as XZR, the assember only defines them for the RELAXED > + and REL models. But there's nothing that prevents us from explicitly > + using XZR with LD<op> for the ACQ and ACQ_REL models. */ > if (is_mm_relaxed (model)) > - return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0"; > + return "st<atomic_ldop><atomic_sfx>\t%<w>1, %0"; > else if (is_mm_release (model)) > - return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0"; > + return "st<atomic_ldop>l<atomic_sfx>\t%<w>1, %0"; > else if (is_mm_acquire (model) || is_mm_consume (model)) > - return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0"; > + return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, <w>zr, %0"; > else > - return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, %<w>3, %0"; > + return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, <w>zr, %0"; > } > ) > > -- > 2.17.1 >
On 10/30/18 8:32 PM, James Greenhalgh wrote: > On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote: >> When the result of an operation is not used, we can ignore the >> result by storing to XZR. For two of the memory models, using >> XZR with LD<op> has a preferred assembler alias, ST<op>. > > ST<op> has different semantics to LD<op>, in particular, ST<op> is not > ordered by a DMB LD; so this could weaken the LDADD and break C11 semantics. > > The relevant Arm Arm text is: > > If the destination register is not one of WZR or XZR, LDADDA and > LDADDAL load from memory with acquire semantics You're quite right. I must have glossed over that clause when looking at this before. I'll make sure there's a temp register to clobber for v2. r~
On 10/30/18 8:32 PM, James Greenhalgh wrote: > On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote: >> When the result of an operation is not used, we can ignore the >> result by storing to XZR. For two of the memory models, using >> XZR with LD<op> has a preferred assembler alias, ST<op>. > > ST<op> has different semantics to LD<op>, in particular, ST<op> is not > ordered by a DMB LD; so this could weaken the LDADD and break C11 semantics. > > The relevant Arm Arm text is: > > If the destination register is not one of WZR or XZR, LDADDA and > LDADDAL load from memory with acquire semantics > > LDADDL and LDADDAL store to memory with release semantics. > > LDADD has no memory ordering requirements. > > I'm taking this to mean that even if the result is unused, using XZR is not > a valid transformation; it weakens the expected acquire semantics to > unordered. > > The example I have from Will Deacon on an internal bug database is: > > P0 (atomic_int* y,atomic_int* x) { > atomic_store_explicit(x,1,memory_order_relaxed); > atomic_thread_fence(memory_order_release); > atomic_store_explicit(y,1,memory_order_relaxed); > } > > P1 (atomic_int* y,atomic_int* x) { > int r0 = atomic_fetch_add_explicit(y,1,memory_order_relaxed); > atomic_thread_fence(memory_order_acquire); > int r1 = atomic_load_explicit(x,memory_order_relaxed); > } > > The outcome where y == 2 and P1 has r0 = 1 and r1 = 0 is illegal. > > This example comes from a while back in my memory; so copying Will for > any more detailed questions. > > My impression is that this transformation is not safe, and so the patch is > not OK. Here's a revised patch. Use ST<op> for relaxed and release orderings, retain the (non-xzr) scratch register for other orderings. But the scratch need not be early-clobber, since there's no mid-point of half-consumed inputs. r~ From 31a9f2fa6b73ecbca39c8d0b1a09f269a9d67f8d Mon Sep 17 00:00:00 2001 From: Richard Henderson <richard.henderson@linaro.org> Date: Wed, 19 Sep 2018 22:18:09 +0000 Subject: aarch64: Emit LSE st<op> instructions * config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse): Use ST<op> for relaxed and release models; the scratch register need not be early-clobber. * gcc.target/aarch64/atomic-inst-ldadd.c: Expect stadd{,l}. * gcc.target/aarch64/atomic-inst-ldlogic.c: Similarly. diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md index 2198649b1be..00536d34dfd 100644 --- a/gcc/config/aarch64/atomics.md +++ b/gcc/config/aarch64/atomics.md @@ -270,14 +270,14 @@ (match_operand:ALLI 1 "register_operand" "r") (match_operand:SI 2 "const_int_operand")] ATOMIC_LDOP)) - (clobber (match_scratch:ALLI 3 "=&r"))] + (clobber (match_scratch:ALLI 3 "=r"))] "TARGET_LSE" { enum memmodel model = memmodel_from_int (INTVAL (operands[2])); if (is_mm_relaxed (model)) - return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0"; + return "st<atomic_ldop><atomic_sfx>\t%<w>1, %0"; else if (is_mm_release (model)) - return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0"; + return "st<atomic_ldop>l<atomic_sfx>\t%<w>1, %0"; else if (is_mm_acquire (model) || is_mm_consume (model)) return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0"; else diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c index 4b2282c6861..db2206186b4 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c @@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN) TEST (sub_load, SUB_LOAD) TEST (sub_load_notreturn, SUB_LOAD_NORETURN) -/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */ +/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */ /* { dg-final { scan-assembler-times "ldaddab\t" 32} } */ -/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */ +/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */ /* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */ +/* { dg-final { scan-assembler-times "staddb\t" 8} } */ +/* { dg-final { scan-assembler-times "staddlb\t" 8} } */ -/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */ +/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */ /* { dg-final { scan-assembler-times "ldaddah\t" 32} } */ -/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */ +/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */ /* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */ +/* { dg-final { scan-assembler-times "staddh\t" 8} } */ +/* { dg-final { scan-assembler-times "staddlh\t" 8} } */ -/* { dg-final { scan-assembler-times "ldadd\t" 32} } */ +/* { dg-final { scan-assembler-times "ldadd\t" 16} } */ /* { dg-final { scan-assembler-times "ldadda\t" 64} } */ -/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */ +/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */ /* { dg-final { scan-assembler-times "ldaddal\t" 64} } */ +/* { dg-final { scan-assembler-times "stadd\t" 16} } */ +/* { dg-final { scan-assembler-times "staddl\t" 16} } */ /* { dg-final { scan-assembler-not "ldaxr\t" } } */ /* { dg-final { scan-assembler-not "stlxr\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c index 4879d52b9b4..b8a53e0a676 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c @@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN) /* Load-OR. */ -/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */ /* { dg-final { scan-assembler-times "ldsetab\t" 16} } */ -/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */ /* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */ +/* { dg-final { scan-assembler-times "stsetb\t" 4} } */ +/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */ -/* { dg-final { scan-assembler-times "ldseth\t" 8} } */ +/* { dg-final { scan-assembler-times "ldseth\t" 4} } */ /* { dg-final { scan-assembler-times "ldsetah\t" 16} } */ -/* { dg-final { scan-assembler-times "ldsetlh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldsetlh\t" 4} } */ /* { dg-final { scan-assembler-times "ldsetalh\t" 16} } */ +/* { dg-final { scan-assembler-times "stseth\t" 4} } */ +/* { dg-final { scan-assembler-times "stsetlh\t" 4} } */ -/* { dg-final { scan-assembler-times "ldset\t" 16} } */ +/* { dg-final { scan-assembler-times "ldset\t" 8} } */ /* { dg-final { scan-assembler-times "ldseta\t" 32} } */ -/* { dg-final { scan-assembler-times "ldsetl\t" 16} } */ +/* { dg-final { scan-assembler-times "ldsetl\t" 8} } */ /* { dg-final { scan-assembler-times "ldsetal\t" 32} } */ +/* { dg-final { scan-assembler-times "stset\t" 8} } */ +/* { dg-final { scan-assembler-times "stsetl\t" 8} } */ /* Load-AND. */ -/* { dg-final { scan-assembler-times "ldclrb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldclrb\t" 4} } */ /* { dg-final { scan-assembler-times "ldclrab\t" 16} } */ -/* { dg-final { scan-assembler-times "ldclrlb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldclrlb\t" 4} } */ /* { dg-final { scan-assembler-times "ldclralb\t" 16} } */ +/* { dg-final { scan-assembler-times "stclrb\t" 4} } */ +/* { dg-final { scan-assembler-times "stclrlb\t" 4} } */ -/* { dg-final { scan-assembler-times "ldclrh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldclrh\t" 4} } */ /* { dg-final { scan-assembler-times "ldclrah\t" 16} } */ -/* { dg-final { scan-assembler-times "ldclrlh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldclrlh\t" 4} } */ /* { dg-final { scan-assembler-times "ldclralh\t" 16} } */ +/* { dg-final { scan-assembler-times "stclrh\t" 4} } */ +/* { dg-final { scan-assembler-times "stclrlh\t" 4} } */ -/* { dg-final { scan-assembler-times "ldclr\t" 16} */ +/* { dg-final { scan-assembler-times "ldclr\t" 8} */ /* { dg-final { scan-assembler-times "ldclra\t" 32} } */ -/* { dg-final { scan-assembler-times "ldclrl\t" 16} } */ +/* { dg-final { scan-assembler-times "ldclrl\t" 8} } */ /* { dg-final { scan-assembler-times "ldclral\t" 32} } */ +/* { dg-final { scan-assembler-times "stclr\t" 8} */ +/* { dg-final { scan-assembler-times "stclrl\t" 8} } */ /* Load-XOR. */ -/* { dg-final { scan-assembler-times "ldeorb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldeorb\t" 4} } */ /* { dg-final { scan-assembler-times "ldeorab\t" 16} } */ -/* { dg-final { scan-assembler-times "ldeorlb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldeorlb\t" 4} } */ /* { dg-final { scan-assembler-times "ldeoralb\t" 16} } */ +/* { dg-final { scan-assembler-times "steorb\t" 4} } */ +/* { dg-final { scan-assembler-times "steorlb\t" 4} } */ -/* { dg-final { scan-assembler-times "ldeorh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldeorh\t" 4} } */ /* { dg-final { scan-assembler-times "ldeorah\t" 16} } */ -/* { dg-final { scan-assembler-times "ldeorlh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldeorlh\t" 4} } */ /* { dg-final { scan-assembler-times "ldeoralh\t" 16} } */ +/* { dg-final { scan-assembler-times "steorh\t" 4} } */ +/* { dg-final { scan-assembler-times "steorlh\t" 4} } */ -/* { dg-final { scan-assembler-times "ldeor\t" 16} */ +/* { dg-final { scan-assembler-times "ldeor\t" 8} */ /* { dg-final { scan-assembler-times "ldeora\t" 32} } */ -/* { dg-final { scan-assembler-times "ldeorl\t" 16} } */ +/* { dg-final { scan-assembler-times "ldeorl\t" 8} } */ /* { dg-final { scan-assembler-times "ldeoral\t" 32} } */ +/* { dg-final { scan-assembler-times "steor\t" 8} */ +/* { dg-final { scan-assembler-times "steorl\t" 8} } */ /* { dg-final { scan-assembler-not "ldaxr\t" } } */ /* { dg-final { scan-assembler-not "stlxr\t" } } */
Hi Richard, On Wed, Oct 31, 2018 at 10:27:59AM +0000, Richard Henderson wrote: > On 10/30/18 8:32 PM, James Greenhalgh wrote: > > On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote: > >> When the result of an operation is not used, we can ignore the > >> result by storing to XZR. For two of the memory models, using > >> XZR with LD<op> has a preferred assembler alias, ST<op>. > > > > ST<op> has different semantics to LD<op>, in particular, ST<op> is not > > ordered by a DMB LD; so this could weaken the LDADD and break C11 semantics. > > > > The relevant Arm Arm text is: > > > > If the destination register is not one of WZR or XZR, LDADDA and > > LDADDAL load from memory with acquire semantics > > > > LDADDL and LDADDAL store to memory with release semantics. > > > > LDADD has no memory ordering requirements. > > > > I'm taking this to mean that even if the result is unused, using XZR is not > > a valid transformation; it weakens the expected acquire semantics to > > unordered. > > > > The example I have from Will Deacon on an internal bug database is: > > > > P0 (atomic_int* y,atomic_int* x) { > > atomic_store_explicit(x,1,memory_order_relaxed); > > atomic_thread_fence(memory_order_release); > > atomic_store_explicit(y,1,memory_order_relaxed); > > } > > > > P1 (atomic_int* y,atomic_int* x) { > > int r0 = atomic_fetch_add_explicit(y,1,memory_order_relaxed); > > atomic_thread_fence(memory_order_acquire); > > int r1 = atomic_load_explicit(x,memory_order_relaxed); > > } > > > > The outcome where y == 2 and P1 has r0 = 1 and r1 = 0 is illegal. > > > > This example comes from a while back in my memory; so copying Will for > > any more detailed questions. > > > > My impression is that this transformation is not safe, and so the patch is > > not OK. > > Here's a revised patch. > > Use ST<op> for relaxed and release orderings, retain the (non-xzr) scratch > register for other orderings. But the scratch need not be early-clobber, since > there's no mid-point of half-consumed inputs. The example test above uses relaxed atomics in conjunction with an acquire fence, so I don't think we can actually use ST<op> at all without a change to the language specification. I previouslyyallocated P0861 for this purpose but never got a chance to write it up... Perhaps the issue is a bit clearer with an additional thread (not often I say that!): P0 (atomic_int* y,atomic_int* x) { atomic_store_explicit(x,1,memory_order_relaxed); atomic_thread_fence(memory_order_release); atomic_store_explicit(y,1,memory_order_relaxed); } P1 (atomic_int* y,atomic_int* x) { atomic_fetch_add_explicit(y,1,memory_order_relaxed); // STADD atomic_thread_fence(memory_order_acquire); int r0 = atomic_load_explicit(x,memory_order_relaxed); } P2 (atomic_int* y) { int r1 = atomic_load_explicit(y,memory_order_relaxed); } My understanding is that it is forbidden for r0 == 0 and r1 == 2 after this test has executed. However, if the relaxed add in P1 compiles to STADD and the subsequent acquire fence is compiled as DMB LD, then we don't have any ordering guarantees in P1 and the forbidden result could be observed. Will
On 10/31/18 3:04 PM, Will Deacon wrote: > The example test above uses relaxed atomics in conjunction with an acquire > fence, so I don't think we can actually use ST<op> at all without a change > to the language specification. I previouslyyallocated P0861 for this purpose > but never got a chance to write it up... > > Perhaps the issue is a bit clearer with an additional thread (not often I > say that!): > > > P0 (atomic_int* y,atomic_int* x) { > atomic_store_explicit(x,1,memory_order_relaxed); > atomic_thread_fence(memory_order_release); > atomic_store_explicit(y,1,memory_order_relaxed); > } > > P1 (atomic_int* y,atomic_int* x) { > atomic_fetch_add_explicit(y,1,memory_order_relaxed); // STADD > atomic_thread_fence(memory_order_acquire); > int r0 = atomic_load_explicit(x,memory_order_relaxed); > } > > P2 (atomic_int* y) { > int r1 = atomic_load_explicit(y,memory_order_relaxed); > } > > > My understanding is that it is forbidden for r0 == 0 and r1 == 2 after > this test has executed. However, if the relaxed add in P1 compiles to > STADD and the subsequent acquire fence is compiled as DMB LD, then we > don't have any ordering guarantees in P1 and the forbidden result could > be observed. I suppose I don't understand exactly what you're saying. I can see that, yes, if you split the fetch-add from the acquire in P1 you get the incorrect results you describe. But isn't that a bug in the test itself? Why would not the only correct version have P1 (atomic_int* y, atomic_int* x) { atomic_fetch_add_explicit(y, 1, memory_order_acquire); int r0 = atomic_load_explicit(x, memory_order_relaxed); } at which point we won't use STADD for the fetch-add, but LDADDA. If the problem is more fundamental than this, would you have another go at explaining? In particular, I don't see the difference between ldadd val, scratch, [base] vs stadd val, [base] and ldaddl val, scratch, [base] vs staddl val, [base] where both pairs of instructions have the same memory ordering semantics. Currently we are always producing the ld version of each pair. r~
On Wed, Oct 31, 2018 at 04:38:53PM +0000, Richard Henderson wrote: > On 10/31/18 3:04 PM, Will Deacon wrote: > > The example test above uses relaxed atomics in conjunction with an acquire > > fence, so I don't think we can actually use ST<op> at all without a change > > to the language specification. I previouslyyallocated P0861 for this purpose > > but never got a chance to write it up... > > > > Perhaps the issue is a bit clearer with an additional thread (not often I > > say that!): > > > > > > P0 (atomic_int* y,atomic_int* x) { > > atomic_store_explicit(x,1,memory_order_relaxed); > > atomic_thread_fence(memory_order_release); > > atomic_store_explicit(y,1,memory_order_relaxed); > > } > > > > P1 (atomic_int* y,atomic_int* x) { > > atomic_fetch_add_explicit(y,1,memory_order_relaxed); // STADD > > atomic_thread_fence(memory_order_acquire); > > int r0 = atomic_load_explicit(x,memory_order_relaxed); > > } > > > > P2 (atomic_int* y) { > > int r1 = atomic_load_explicit(y,memory_order_relaxed); > > } > > > > > > My understanding is that it is forbidden for r0 == 0 and r1 == 2 after > > this test has executed. However, if the relaxed add in P1 compiles to > > STADD and the subsequent acquire fence is compiled as DMB LD, then we > > don't have any ordering guarantees in P1 and the forbidden result could > > be observed. > > I suppose I don't understand exactly what you're saying. Apologies, I'm probably not explaining things very well. I'm trying to avoid getting into the C11 memory model relations if I can help it, hence the example. > I can see that, yes, if you split the fetch-add from the acquire in P1 you get > the incorrect results you describe. But isn't that a bug in the test itself? Per the C11 memory model, the test above is well-defined and if r1 == 2 then it is required that r0 == 1. With your proposal, this is not guaranteed for AArch64, and it would be possible to end up with r1 == 2 and r0 == 0. > Why would not the only correct version have > > P1 (atomic_int* y, atomic_int* x) { > atomic_fetch_add_explicit(y, 1, memory_order_acquire); > int r0 = atomic_load_explicit(x, memory_order_relaxed); > } > > at which point we won't use STADD for the fetch-add, but LDADDA. That would indeed work correctly, but the problem is that the C11 memory model doesn't rule out the previous test as something which isn't portable. > If the problem is more fundamental than this, would you have another go at > explaining? In particular, I don't see the difference between > > ldadd val, scratch, [base] > vs > stadd val, [base] > > and > > ldaddl val, scratch, [base] > vs > staddl val, [base] > > where both pairs of instructions have the same memory ordering semantics. > Currently we are always producing the ld version of each pair. Aha, maybe this is the problem. An acquire fence on AArch64 is implemented using a DMB LD instruction, which orders prior reads against subsequent reads and writes. However, the architecture says: | The ST<OP> instructions, and LD<OP> instructions where the destination | register is WZR or XZR, are not regarded as doing a read for the purpose | of a DMB LD barrier. and so therefore an ST atomic is not affected by a subsequent acquire fence, whereas an LD atomic is. Does that help at all? Will
On 10/31/18 5:51 PM, Will Deacon wrote: > Aha, maybe this is the problem. An acquire fence on AArch64 is implemented > using a DMB LD instruction, which orders prior reads against subsequent > reads and writes. However, the architecture says: > > | The ST<OP> instructions, and LD<OP> instructions where the destination > | register is WZR or XZR, are not regarded as doing a read for the purpose > | of a DMB LD barrier. > > and so therefore an ST atomic is not affected by a subsequent acquire fence, > whereas an LD atomic is. > > Does that help at all? Yes it does, thanks. Lest this come up again, let's document this. r~ >From 804f690fc8ebaa436b97ea4c9fef830f9cd2b873 Mon Sep 17 00:00:00 2001 From: Richard Henderson <richard.henderson@linaro.org> Date: Wed, 19 Sep 2018 22:18:09 +0000 Subject: [PATCH] aarch64: Remove early clobber from ATOMIC_LDOP scratch * config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse): The scratch register need not be early-clobber. Document the reason why we cannot use ST<OP>. --- gcc/config/aarch64/atomics.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md index 2198649b1be..8944b5690b5 100644 --- a/gcc/config/aarch64/atomics.md +++ b/gcc/config/aarch64/atomics.md @@ -263,6 +263,18 @@ } ) +;; It is tempting to want to use ST<OP> here for relaxed and release +;; memory models here. However, that is incompatible with the C++ +;; memory model for the following case: +;; +;; atomic_fetch_add(ptr, 1, memory_order_relaxed); +;; atomic_thread_fence(memory_order_acquire); +;; +;; The problem is that the architecture says that ST<OP> (and LD<OP> +;; insns where the destination is XZR) are not regarded as a read. +;; However we also implement the acquire memory barrier with DMB LD, +;; and so the ST<OP> is not blocked by the barrier. + (define_insn "aarch64_atomic_<atomic_ldoptab><mode>_lse" [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q") (unspec_volatile:ALLI @@ -270,7 +282,7 @@ (match_operand:ALLI 1 "register_operand" "r") (match_operand:SI 2 "const_int_operand")] ATOMIC_LDOP)) - (clobber (match_scratch:ALLI 3 "=&r"))] + (clobber (match_scratch:ALLI 3 "=r"))] "TARGET_LSE" { enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
On Wed, Oct 31, 2018 at 04:55:26PM -0500, Richard Henderson wrote: > On 10/31/18 5:51 PM, Will Deacon wrote: > > Aha, maybe this is the problem. An acquire fence on AArch64 is implemented > > using a DMB LD instruction, which orders prior reads against subsequent > > reads and writes. However, the architecture says: > > > > | The ST<OP> instructions, and LD<OP> instructions where the destination > > | register is WZR or XZR, are not regarded as doing a read for the purpose > > | of a DMB LD barrier. > > > > and so therefore an ST atomic is not affected by a subsequent acquire fence, > > whereas an LD atomic is. > > > > Does that help at all? > > Yes it does, thanks. Lest this come up again, let's document this. Good idea. OK. James > >From 804f690fc8ebaa436b97ea4c9fef830f9cd2b873 Mon Sep 17 00:00:00 2001 > From: Richard Henderson <richard.henderson@linaro.org> > Date: Wed, 19 Sep 2018 22:18:09 +0000 > Subject: [PATCH] aarch64: Remove early clobber from ATOMIC_LDOP scratch > > * config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse): > The scratch register need not be early-clobber. Document the reason > why we cannot use ST<OP>. > --- > gcc/config/aarch64/atomics.md | 14 +++++++++++++- > 1 file changed, 13 insertions(+), 1 deletion(-) > > diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md > index 2198649b1be..8944b5690b5 100644 > --- a/gcc/config/aarch64/atomics.md > +++ b/gcc/config/aarch64/atomics.md > @@ -263,6 +263,18 @@ > } > ) > > +;; It is tempting to want to use ST<OP> here for relaxed and release > +;; memory models here. However, that is incompatible with the C++ > +;; memory model for the following case: > +;; > +;; atomic_fetch_add(ptr, 1, memory_order_relaxed); > +;; atomic_thread_fence(memory_order_acquire); > +;; > +;; The problem is that the architecture says that ST<OP> (and LD<OP> > +;; insns where the destination is XZR) are not regarded as a read. > +;; However we also implement the acquire memory barrier with DMB LD, > +;; and so the ST<OP> is not blocked by the barrier. > + > (define_insn "aarch64_atomic_<atomic_ldoptab><mode>_lse" > [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q") > (unspec_volatile:ALLI > @@ -270,7 +282,7 @@ > (match_operand:ALLI 1 "register_operand" "r") > (match_operand:SI 2 "const_int_operand")] > ATOMIC_LDOP)) > - (clobber (match_scratch:ALLI 3 "=&r"))] > + (clobber (match_scratch:ALLI 3 "=r"))] > "TARGET_LSE" > { > enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > -- > 2.17.2 >
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c index 4b2282c6861..db2206186b4 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c @@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN) TEST (sub_load, SUB_LOAD) TEST (sub_load_notreturn, SUB_LOAD_NORETURN) -/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */ +/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */ /* { dg-final { scan-assembler-times "ldaddab\t" 32} } */ -/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */ +/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */ /* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */ +/* { dg-final { scan-assembler-times "staddb\t" 8} } */ +/* { dg-final { scan-assembler-times "staddlb\t" 8} } */ -/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */ +/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */ /* { dg-final { scan-assembler-times "ldaddah\t" 32} } */ -/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */ +/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */ /* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */ +/* { dg-final { scan-assembler-times "staddh\t" 8} } */ +/* { dg-final { scan-assembler-times "staddlh\t" 8} } */ -/* { dg-final { scan-assembler-times "ldadd\t" 32} } */ +/* { dg-final { scan-assembler-times "ldadd\t" 16} } */ /* { dg-final { scan-assembler-times "ldadda\t" 64} } */ -/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */ +/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */ /* { dg-final { scan-assembler-times "ldaddal\t" 64} } */ +/* { dg-final { scan-assembler-times "stadd\t" 16} } */ +/* { dg-final { scan-assembler-times "staddl\t" 16} } */ /* { dg-final { scan-assembler-not "ldaxr\t" } } */ /* { dg-final { scan-assembler-not "stlxr\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c index 4879d52b9b4..b8a53e0a676 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c @@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN) /* Load-OR. */ -/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */ /* { dg-final { scan-assembler-times "ldsetab\t" 16} } */ -/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */ /* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */ +/* { dg-final { scan-assembler-times "stsetb\t" 4} } */ +/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */ -/* { dg-final { scan-assembler-times "ldseth\t" 8} } */ +/* { dg-final { scan-assembler-times "ldseth\t" 4} } */ /* { dg-final { scan-assembler-times "ldsetah\t" 16} } */ -/* { dg-final { scan-assembler-times "ldsetlh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldsetlh\t" 4} } */ /* { dg-final { scan-assembler-times "ldsetalh\t" 16} } */ +/* { dg-final { scan-assembler-times "stseth\t" 4} } */ +/* { dg-final { scan-assembler-times "stsetlh\t" 4} } */ -/* { dg-final { scan-assembler-times "ldset\t" 16} } */ +/* { dg-final { scan-assembler-times "ldset\t" 8} } */ /* { dg-final { scan-assembler-times "ldseta\t" 32} } */ -/* { dg-final { scan-assembler-times "ldsetl\t" 16} } */ +/* { dg-final { scan-assembler-times "ldsetl\t" 8} } */ /* { dg-final { scan-assembler-times "ldsetal\t" 32} } */ +/* { dg-final { scan-assembler-times "stset\t" 8} } */ +/* { dg-final { scan-assembler-times "stsetl\t" 8} } */ /* Load-AND. */ -/* { dg-final { scan-assembler-times "ldclrb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldclrb\t" 4} } */ /* { dg-final { scan-assembler-times "ldclrab\t" 16} } */ -/* { dg-final { scan-assembler-times "ldclrlb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldclrlb\t" 4} } */ /* { dg-final { scan-assembler-times "ldclralb\t" 16} } */ +/* { dg-final { scan-assembler-times "stclrb\t" 4} } */ +/* { dg-final { scan-assembler-times "stclrlb\t" 4} } */ -/* { dg-final { scan-assembler-times "ldclrh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldclrh\t" 4} } */ /* { dg-final { scan-assembler-times "ldclrah\t" 16} } */ -/* { dg-final { scan-assembler-times "ldclrlh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldclrlh\t" 4} } */ /* { dg-final { scan-assembler-times "ldclralh\t" 16} } */ +/* { dg-final { scan-assembler-times "stclrh\t" 4} } */ +/* { dg-final { scan-assembler-times "stclrlh\t" 4} } */ -/* { dg-final { scan-assembler-times "ldclr\t" 16} */ +/* { dg-final { scan-assembler-times "ldclr\t" 8} */ /* { dg-final { scan-assembler-times "ldclra\t" 32} } */ -/* { dg-final { scan-assembler-times "ldclrl\t" 16} } */ +/* { dg-final { scan-assembler-times "ldclrl\t" 8} } */ /* { dg-final { scan-assembler-times "ldclral\t" 32} } */ +/* { dg-final { scan-assembler-times "stclr\t" 8} */ +/* { dg-final { scan-assembler-times "stclrl\t" 8} } */ /* Load-XOR. */ -/* { dg-final { scan-assembler-times "ldeorb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldeorb\t" 4} } */ /* { dg-final { scan-assembler-times "ldeorab\t" 16} } */ -/* { dg-final { scan-assembler-times "ldeorlb\t" 8} } */ +/* { dg-final { scan-assembler-times "ldeorlb\t" 4} } */ /* { dg-final { scan-assembler-times "ldeoralb\t" 16} } */ +/* { dg-final { scan-assembler-times "steorb\t" 4} } */ +/* { dg-final { scan-assembler-times "steorlb\t" 4} } */ -/* { dg-final { scan-assembler-times "ldeorh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldeorh\t" 4} } */ /* { dg-final { scan-assembler-times "ldeorah\t" 16} } */ -/* { dg-final { scan-assembler-times "ldeorlh\t" 8} } */ +/* { dg-final { scan-assembler-times "ldeorlh\t" 4} } */ /* { dg-final { scan-assembler-times "ldeoralh\t" 16} } */ +/* { dg-final { scan-assembler-times "steorh\t" 4} } */ +/* { dg-final { scan-assembler-times "steorlh\t" 4} } */ -/* { dg-final { scan-assembler-times "ldeor\t" 16} */ +/* { dg-final { scan-assembler-times "ldeor\t" 8} */ /* { dg-final { scan-assembler-times "ldeora\t" 32} } */ -/* { dg-final { scan-assembler-times "ldeorl\t" 16} } */ +/* { dg-final { scan-assembler-times "ldeorl\t" 8} } */ /* { dg-final { scan-assembler-times "ldeoral\t" 32} } */ +/* { dg-final { scan-assembler-times "steor\t" 8} */ +/* { dg-final { scan-assembler-times "steorl\t" 8} } */ /* { dg-final { scan-assembler-not "ldaxr\t" } } */ /* { dg-final { scan-assembler-not "stlxr\t" } } */ diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md index 2198649b1be..ee662a4480e 100644 --- a/gcc/config/aarch64/atomics.md +++ b/gcc/config/aarch64/atomics.md @@ -269,19 +269,22 @@ [(match_dup 0) (match_operand:ALLI 1 "register_operand" "r") (match_operand:SI 2 "const_int_operand")] - ATOMIC_LDOP)) - (clobber (match_scratch:ALLI 3 "=&r"))] + ATOMIC_LDOP))] "TARGET_LSE" { enum memmodel model = memmodel_from_int (INTVAL (operands[2])); + /* Note that while ST<op> is an alias for LD<op> with the second + operand as XZR, the assember only defines them for the RELAXED + and REL models. But there's nothing that prevents us from explicitly + using XZR with LD<op> for the ACQ and ACQ_REL models. */ if (is_mm_relaxed (model)) - return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0"; + return "st<atomic_ldop><atomic_sfx>\t%<w>1, %0"; else if (is_mm_release (model)) - return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0"; + return "st<atomic_ldop>l<atomic_sfx>\t%<w>1, %0"; else if (is_mm_acquire (model) || is_mm_consume (model)) - return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0"; + return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, <w>zr, %0"; else - return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, %<w>3, %0"; + return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, <w>zr, %0"; } )