Message ID | 20220728063120.2867508-6-npiggin@gmail.com (mailing list archive)
---|---
State | Changes Requested
Series | powerpc: alternate queued spinlock implementation
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote: [resend as utf-8, not utf-7] > This uses more optimal ll/sc style access patterns (rather than > cmpxchg), and also sets the EH=1 lock hint on those operations > which acquire ownership of the lock. > --- > arch/powerpc/include/asm/qspinlock.h | 25 +++++-- > arch/powerpc/include/asm/qspinlock_types.h | 6 +- > arch/powerpc/lib/qspinlock.c | 81 +++++++++++++++------- > 3 files changed, 79 insertions(+), 33 deletions(-) > > diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h > index 79a1936fb68d..3ab354159e5e 100644 > --- a/arch/powerpc/include/asm/qspinlock.h > +++ b/arch/powerpc/include/asm/qspinlock.h > @@ -2,28 +2,43 @@ > #ifndef _ASM_POWERPC_QSPINLOCK_H > #define _ASM_POWERPC_QSPINLOCK_H > > -#include <linux/atomic.h> > #include <linux/compiler.h> > #include <asm/qspinlock_types.h> > > static __always_inline int queued_spin_is_locked(struct qspinlock *lock) > { > - return atomic_read(&lock->val); > + return READ_ONCE(lock->val); > } > > static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) > { > - return !atomic_read(&lock.val); > + return !lock.val; > } > > static __always_inline int queued_spin_is_contended(struct qspinlock *lock) > { > - return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK); > + return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK); > } > > static __always_inline int queued_spin_trylock(struct qspinlock *lock) > { > - if (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0) > + u32 new = _Q_LOCKED_VAL; > + u32 prev; > + > + asm volatile( > +"1: lwarx %0,0,%1,%3 # queued_spin_trylock \n" > +" cmpwi 0,%0,0 \n" > +" bne- 2f \n" > +" stwcx. %2,0,%1 \n" > +" bne- 1b \n" > +"\t" PPC_ACQUIRE_BARRIER " \n" > +"2: \n" > + : "=&r" (prev) > + : "r" (&lock->val), "r" (new), > + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) btw IS_ENABLED() already returns 1 or 0 > + : "cr0", "memory"); This is the ISA's "test and set" atomic primitive. Do you think it would be worth separating it as a helper? 
> + > + if (likely(prev == 0)) > return 1; > return 0; same optional style nit: return likely(prev == 0); > } > diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h > index 3425dab42576..210adf05b235 100644 > --- a/arch/powerpc/include/asm/qspinlock_types.h > +++ b/arch/powerpc/include/asm/qspinlock_types.h > @@ -7,7 +7,7 @@ > > typedef struct qspinlock { > union { > - atomic_t val; > + u32 val; > > #ifdef __LITTLE_ENDIAN > struct { > @@ -23,10 +23,10 @@ typedef struct qspinlock { > }; > } arch_spinlock_t; > > -#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = ATOMIC_INIT(0) } } > +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = 0 } } > > /* > - * Bitfields in the atomic value: > + * Bitfields in the lock word: > * > * 0: locked bit > * 16-31: tail cpu (+1) > diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c > index 5ebb88d95636..7c71e5e287df 100644 > --- a/arch/powerpc/lib/qspinlock.c > +++ b/arch/powerpc/lib/qspinlock.c > @@ -1,5 +1,4 @@ > // SPDX-License-Identifier: GPL-2.0-or-later > -#include <linux/atomic.h> > #include <linux/bug.h> > #include <linux/compiler.h> > #include <linux/export.h> > @@ -22,32 +21,59 @@ struct qnodes { > > static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); > > -static inline int encode_tail_cpu(void) > +static inline u32 encode_tail_cpu(void) > { > return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET; > } > > -static inline int get_tail_cpu(int val) > +static inline int get_tail_cpu(u32 val) > { > return (val >> _Q_TAIL_CPU_OFFSET) - 1; > } > > /* Take the lock by setting the bit, no other CPUs may concurrently lock it. */ I think you missed deleting the above line. > +/* Take the lock by setting the lock bit, no other CPUs will touch it. */ > static __always_inline void lock_set_locked(struct qspinlock *lock) > { > - atomic_or(_Q_LOCKED_VAL, &lock->val); > - __atomic_acquire_fence(); > + u32 new = _Q_LOCKED_VAL; > + u32 prev; > + > + asm volatile( > +"1: lwarx %0,0,%1,%3 # lock_set_locked \n" > +" or %0,%0,%2 \n" > +" stwcx. %0,0,%1 \n" > +" bne- 1b \n" > +"\t" PPC_ACQUIRE_BARRIER " \n" > + : "=&r" (prev) > + : "r" (&lock->val), "r" (new), > + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) > + : "cr0", "memory"); > } This is pretty similar with the DEFINE_TESTOP() pattern from arch/powerpc/include/asm/bitops.h (such as test_and_set_bits_lock()) except for word instead of double word. Do you think it's possible / beneficial to make use of those macros? > > -/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */ > -static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int val) > +/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */ > +static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old) > { > - int newval = _Q_LOCKED_VAL; > - > - if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val) > + u32 new = _Q_LOCKED_VAL; > + u32 prev; > + > + BUG_ON(old & _Q_LOCKED_VAL); The BUG_ON() could have been introduced in an earlier patch I think. > + > + asm volatile( > +"1: lwarx %0,0,%1,%4 # trylock_clear_tail_cpu \n" > +" cmpw 0,%0,%2 \n" > +" bne- 2f \n" > +" stwcx. %3,0,%1 \n" > +" bne- 1b \n" > +"\t" PPC_ACQUIRE_BARRIER " \n" > +"2: \n" > + : "=&r" (prev) > + : "r" (&lock->val), "r"(old), "r" (new), Could this be like "r"(_Q_TAIL_CPU_MASK) below? i.e. "r" (_Q_LOCKED_VAL)? Makes it clear new doesn't change. > + "i" (IS_ENABLED(CONFIG_PPC64) ? 
1 : 0) > + : "cr0", "memory"); > + > + if (likely(prev == old)) > return 1; > - else > - return 0; > + return 0; > } > > /* > @@ -56,20 +82,25 @@ static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int va > * This provides a release barrier for publishing node, and an acquire barrier Does the comment mean there needs to be an acquire barrier in this assembly? > * for getting the old node. > */ > -static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail) > +static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail) > { > - for (;;) { > - int val = atomic_read(&lock->val); > - int newval = (val & ~_Q_TAIL_CPU_MASK) | tail; > - int old; > - > - old = atomic_cmpxchg(&lock->val, val, newval); > - if (old == val) > - return old; > - } > + u32 prev, tmp; > + > + asm volatile( > +"\t" PPC_RELEASE_BARRIER " \n" > +"1: lwarx %0,0,%2 # publish_tail_cpu \n" > +" andc %1,%0,%4 \n" > +" or %1,%1,%3 \n" > +" stwcx. %1,0,%2 \n" > +" bne- 1b \n" > + : "=&r" (prev), "=&r"(tmp) > + : "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK) > + : "cr0", "memory"); > + > + return prev; > } > > -static struct qnode *get_tail_qnode(struct qspinlock *lock, int val) > +static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) > { > int cpu = get_tail_cpu(val); > struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu); > @@ -88,7 +119,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) > { > struct qnodes *qnodesp; > struct qnode *next, *node; > - int val, old, tail; > + u32 val, old, tail; > int idx; > > BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); > @@ -134,7 +165,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) > } > > /* We're at the head of the waitqueue, wait for the lock. */ > - while ((val = atomic_read(&lock->val)) & _Q_LOCKED_VAL) > + while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) > cpu_relax(); > > /* If we're the last queued, must clean up the tail. */
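
As an aside on the helper question above, here is a minimal sketch of how the ll/sc trylock sequence could be factored out as a word-sized cmpxchg-acquire primitive. This is illustrative only — the name, signature and placement are assumptions, not code from the series; it simply parameterizes the asm that already appears in queued_spin_trylock() and trylock_clear_tail_cpu():

/*
 * Hypothetical helper (not from the patch): atomically set *p to 'new'
 * if it currently equals 'old', with acquire semantics on success.
 * Returns the previous value of *p. On 64-bit, EH=1 is passed to lwarx
 * as a hint that the reservation is being used to acquire a lock.
 */
static __always_inline u32 trylock_cmpxchg_acquire(u32 *p, u32 old, u32 new)
{
	u32 prev;

	asm volatile(
"1:	lwarx	%0,0,%1,%4	# trylock_cmpxchg_acquire	\n"
"	cmpw	0,%0,%2						\n"
"	bne-	2f						\n"
"	stwcx.	%3,0,%1						\n"
"	bne-	1b						\n"
"\t" PPC_ACQUIRE_BARRIER "					\n"
"2:								\n"
	: "=&r" (prev)
	: "r" (p), "r" (old), "r" (new),
	  "i" (IS_ENABLED(CONFIG_PPC64))
	: "cr0", "memory");

	return prev;
}

With something like this, queued_spin_trylock() would reduce to: return likely(trylock_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0);
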
On 10/11/2022 at 01:39, Jordan Niethe wrote: > On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote: > [resend as utf-8, not utf-7] >> This uses more optimal ll/sc style access patterns (rather than >> cmpxchg), and also sets the EH=1 lock hint on those operations >> which acquire ownership of the lock. >> --- >> arch/powerpc/include/asm/qspinlock.h | 25 +++++-- >> arch/powerpc/include/asm/qspinlock_types.h | 6 +- >> arch/powerpc/lib/qspinlock.c | 81 +++++++++++++++------- >> 3 files changed, 79 insertions(+), 33 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h >> index 79a1936fb68d..3ab354159e5e 100644 >> --- a/arch/powerpc/include/asm/qspinlock.h >> +++ b/arch/powerpc/include/asm/qspinlock.h >> @@ -2,28 +2,43 @@ >> #ifndef _ASM_POWERPC_QSPINLOCK_H >> #define _ASM_POWERPC_QSPINLOCK_H >> >> -#include <linux/atomic.h> >> #include <linux/compiler.h> >> #include <asm/qspinlock_types.h> >> >> static __always_inline int queued_spin_is_locked(struct qspinlock *lock) >> { >> - return atomic_read(&lock->val); >> + return READ_ONCE(lock->val); >> } >> >> static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) >> { >> - return !atomic_read(&lock.val); >> + return !lock.val; >> } >> >> static __always_inline int queued_spin_is_contended(struct qspinlock *lock) >> { >> - return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK); >> + return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK); >> } >> >> static __always_inline int queued_spin_trylock(struct qspinlock *lock) >> { >> - if (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0) >> + u32 new = _Q_LOCKED_VAL; >> + u32 prev; >> + >> + asm volatile( >> +"1: lwarx %0,0,%1,%3 # queued_spin_trylock \n" >> +" cmpwi 0,%0,0 \n" >> +" bne- 2f \n" >> +" stwcx. %2,0,%1 \n" >> +" bne- 1b \n" >> +"\t" PPC_ACQUIRE_BARRIER " \n" >> +"2: \n" >> + : "=&r" (prev) >> + : "r" (&lock->val), "r" (new), >> + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) > > btw IS_ENABLED() already returns 1 or 0 > >> + : "cr0", "memory"); > > This is the ISA's "test and set" atomic primitive. Do you think it would be worth separating it as a helper? 
> >> + >> + if (likely(prev == 0)) >> return 1; >> return 0; > > same optional style nit: return likely(prev == 0); return likely(!prev); > >> } >> diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h >> index 3425dab42576..210adf05b235 100644 >> --- a/arch/powerpc/include/asm/qspinlock_types.h >> +++ b/arch/powerpc/include/asm/qspinlock_types.h >> @@ -7,7 +7,7 @@ >> >> typedef struct qspinlock { >> union { >> - atomic_t val; >> + u32 val; >> >> #ifdef __LITTLE_ENDIAN >> struct { >> @@ -23,10 +23,10 @@ typedef struct qspinlock { >> }; >> } arch_spinlock_t; >> >> -#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = ATOMIC_INIT(0) } } >> +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = 0 } } >> >> /* >> - * Bitfields in the atomic value: >> + * Bitfields in the lock word: >> * >> * 0: locked bit >> * 16-31: tail cpu (+1) >> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c >> index 5ebb88d95636..7c71e5e287df 100644 >> --- a/arch/powerpc/lib/qspinlock.c >> +++ b/arch/powerpc/lib/qspinlock.c >> @@ -1,5 +1,4 @@ >> // SPDX-License-Identifier: GPL-2.0-or-later >> -#include <linux/atomic.h> >> #include <linux/bug.h> >> #include <linux/compiler.h> >> #include <linux/export.h> >> @@ -22,32 +21,59 @@ struct qnodes { >> >> static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); >> >> -static inline int encode_tail_cpu(void) >> +static inline u32 encode_tail_cpu(void) >> { >> return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET; >> } >> >> -static inline int get_tail_cpu(int val) >> +static inline int get_tail_cpu(u32 val) >> { >> return (val >> _Q_TAIL_CPU_OFFSET) - 1; >> } >> >> /* Take the lock by setting the bit, no other CPUs may concurrently lock it. */ > > I think you missed deleting the above line. > >> +/* Take the lock by setting the lock bit, no other CPUs will touch it. */ >> static __always_inline void lock_set_locked(struct qspinlock *lock) >> { >> - atomic_or(_Q_LOCKED_VAL, &lock->val); >> - __atomic_acquire_fence(); >> + u32 new = _Q_LOCKED_VAL; >> + u32 prev; >> + >> + asm volatile( >> +"1: lwarx %0,0,%1,%3 # lock_set_locked \n" >> +" or %0,%0,%2 \n" >> +" stwcx. %0,0,%1 \n" >> +" bne- 1b \n" >> +"\t" PPC_ACQUIRE_BARRIER " \n" >> + : "=&r" (prev) >> + : "r" (&lock->val), "r" (new), >> + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) >> + : "cr0", "memory"); >> } > > This is pretty similar with the DEFINE_TESTOP() pattern from > arch/powerpc/include/asm/bitops.h (such as test_and_set_bits_lock()) except for > word instead of double word. Do you think it's possible / beneficial to make > use of those macros? > > >> >> -/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */ >> -static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int val) >> +/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */ >> +static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old) >> { >> - int newval = _Q_LOCKED_VAL; >> - >> - if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val) >> + u32 new = _Q_LOCKED_VAL; >> + u32 prev; >> + >> + BUG_ON(old & _Q_LOCKED_VAL); > > The BUG_ON() could have been introduced in an earlier patch I think. Can we avoid the BUG_ON() altogether and replace it with a WARN_ON()? > >> + >> + asm volatile( >> +"1: lwarx %0,0,%1,%4 # trylock_clear_tail_cpu \n" >> +" cmpw 0,%0,%2 \n" >> +" bne- 2f \n" >> +" stwcx. 
%3,0,%1 \n" >> +" bne- 1b \n" >> +"\t" PPC_ACQUIRE_BARRIER " \n" >> +"2: \n" >> + : "=&r" (prev) >> + : "r" (&lock->val), "r"(old), "r" (new), > > Could this be like "r"(_Q_TAIL_CPU_MASK) below? > i.e. "r" (_Q_LOCKED_VAL)? Makes it clear new doesn't change. > >> + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) >> + : "cr0", "memory"); >> + >> + if (likely(prev == old)) >> return 1; >> - else >> - return 0; >> + return 0; >> } >> >> /* >> @@ -56,20 +82,25 @@ static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int va >> * This provides a release barrier for publishing node, and an acquire barrier > > Does the comment mean there needs to be an acquire barrier in this assembly? > > >> * for getting the old node. >> */ >> -static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail) >> +static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail) >> { >> - for (;;) { >> - int val = atomic_read(&lock->val); >> - int newval = (val & ~_Q_TAIL_CPU_MASK) | tail; >> - int old; >> - >> - old = atomic_cmpxchg(&lock->val, val, newval); >> - if (old == val) >> - return old; >> - } >> + u32 prev, tmp; >> + >> + asm volatile( >> +"\t" PPC_RELEASE_BARRIER " \n" >> +"1: lwarx %0,0,%2 # publish_tail_cpu \n" >> +" andc %1,%0,%4 \n" >> +" or %1,%1,%3 \n" >> +" stwcx. %1,0,%2 \n" >> +" bne- 1b \n" >> + : "=&r" (prev), "=&r"(tmp) >> + : "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK) >> + : "cr0", "memory"); >> + >> + return prev; >> } >> >> -static struct qnode *get_tail_qnode(struct qspinlock *lock, int val) >> +static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) >> { >> int cpu = get_tail_cpu(val); >> struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu); >> @@ -88,7 +119,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) >> { >> struct qnodes *qnodesp; >> struct qnode *next, *node; >> - int val, old, tail; >> + u32 val, old, tail; >> int idx; >> >> BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); >> @@ -134,7 +165,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) >> } >> >> /* We're at the head of the waitqueue, wait for the lock. */ >> - while ((val = atomic_read(&lock->val)) & _Q_LOCKED_VAL) >> + while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) >> cpu_relax(); >> >> /* If we're the last queued, must clean up the tail. */ >
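
To make the WARN_ON() question concrete, here is a sketch of how that variant might look. This is an assumption about one possible shape, not code from the series (it also folds in the earlier review nits: plain IS_ENABLED(), "r" (_Q_LOCKED_VAL) instead of a local, and return likely()); whether degrading to a failed trylock is acceptable here is exactly what the follow-up reply below addresses:

/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old)
{
	u32 prev;

	/*
	 * Hypothetical WARN_ON() fallback: warn once and report a failed
	 * trylock so the caller re-reads the lock word, instead of BUG().
	 */
	if (WARN_ON_ONCE(old & _Q_LOCKED_VAL))
		return 0;

	asm volatile(
"1:	lwarx	%0,0,%1,%4	# trylock_clear_tail_cpu	\n"
"	cmpw	0,%0,%2						\n"
"	bne-	2f						\n"
"	stwcx.	%3,0,%1						\n"
"	bne-	1b						\n"
"\t" PPC_ACQUIRE_BARRIER "					\n"
"2:								\n"
	: "=&r" (prev)
	: "r" (&lock->val), "r" (old), "r" (_Q_LOCKED_VAL),
	  "i" (IS_ENABLED(CONFIG_PPC64))
	: "cr0", "memory");

	return likely(prev == old);
}
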
On Thu Nov 10, 2022 at 10:39 AM AEST, Jordan Niethe wrote: > On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote: > [resend as utf-8, not utf-7] > > This uses more optimal ll/sc style access patterns (rather than > > cmpxchg), and also sets the EH=1 lock hint on those operations > > which acquire ownership of the lock. > > --- > > arch/powerpc/include/asm/qspinlock.h | 25 +++++-- > > arch/powerpc/include/asm/qspinlock_types.h | 6 +- > > arch/powerpc/lib/qspinlock.c | 81 +++++++++++++++------- > > 3 files changed, 79 insertions(+), 33 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h > > index 79a1936fb68d..3ab354159e5e 100644 > > --- a/arch/powerpc/include/asm/qspinlock.h > > +++ b/arch/powerpc/include/asm/qspinlock.h > > @@ -2,28 +2,43 @@ > > #ifndef _ASM_POWERPC_QSPINLOCK_H > > #define _ASM_POWERPC_QSPINLOCK_H > > > > -#include <linux/atomic.h> > > #include <linux/compiler.h> > > #include <asm/qspinlock_types.h> > > > > static __always_inline int queued_spin_is_locked(struct qspinlock *lock) > > { > > - return atomic_read(&lock->val); > > + return READ_ONCE(lock->val); > > } > > > > static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) > > { > > - return !atomic_read(&lock.val); > > + return !lock.val; > > } > > > > static __always_inline int queued_spin_is_contended(struct qspinlock *lock) > > { > > - return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK); > > + return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK); > > } > > > > static __always_inline int queued_spin_trylock(struct qspinlock *lock) > > { > > - if (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0) > > + u32 new = _Q_LOCKED_VAL; > > + u32 prev; > > + > > + asm volatile( > > +"1: lwarx %0,0,%1,%3 # queued_spin_trylock \n" > > +" cmpwi 0,%0,0 \n" > > +" bne- 2f \n" > > +" stwcx. %2,0,%1 \n" > > +" bne- 1b \n" > > +"\t" PPC_ACQUIRE_BARRIER " \n" > > +"2: \n" > > + : "=&r" (prev) > > + : "r" (&lock->val), "r" (new), > > + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) > > btw IS_ENABLED() already returns 1 or 0 I guess we already do that in atomic.h too. Okay. > > + : "cr0", "memory"); > > This is the ISA's "test and set" atomic primitive. Do you think it would be worth separating it as a helper? It ends up getting more complex as we go. I might leave some of these primitives open coded for now; we could possibly look at providing them or reusing more generic primitives after the series, though. > > + > > + if (likely(prev == 0)) > > return 1; > > return 0; > > same optional style nit: return likely(prev == 0); Will do. 
> > > } > > diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h > > index 3425dab42576..210adf05b235 100644 > > --- a/arch/powerpc/include/asm/qspinlock_types.h > > +++ b/arch/powerpc/include/asm/qspinlock_types.h > > @@ -7,7 +7,7 @@ > > > > typedef struct qspinlock { > > union { > > - atomic_t val; > > + u32 val; > > > > #ifdef __LITTLE_ENDIAN > > struct { > > @@ -23,10 +23,10 @@ typedef struct qspinlock { > > }; > > } arch_spinlock_t; > > > > -#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = ATOMIC_INIT(0) } } > > +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = 0 } } > > > > /* > > - * Bitfields in the atomic value: > > + * Bitfields in the lock word: > > * > > * 0: locked bit > > * 16-31: tail cpu (+1) > > diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c > > index 5ebb88d95636..7c71e5e287df 100644 > > --- a/arch/powerpc/lib/qspinlock.c > > +++ b/arch/powerpc/lib/qspinlock.c > > @@ -1,5 +1,4 @@ > > // SPDX-License-Identifier: GPL-2.0-or-later > > -#include <linux/atomic.h> > > #include <linux/bug.h> > > #include <linux/compiler.h> > > #include <linux/export.h> > > @@ -22,32 +21,59 @@ struct qnodes { > > > > static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); > > > > -static inline int encode_tail_cpu(void) > > +static inline u32 encode_tail_cpu(void) > > { > > return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET; > > } > > > > -static inline int get_tail_cpu(int val) > > +static inline int get_tail_cpu(u32 val) > > { > > return (val >> _Q_TAIL_CPU_OFFSET) - 1; > > } > > > > /* Take the lock by setting the bit, no other CPUs may concurrently lock it. */ > > I think you missed deleting the above line. > > > +/* Take the lock by setting the lock bit, no other CPUs will touch it. */ > > static __always_inline void lock_set_locked(struct qspinlock *lock) > > { > > - atomic_or(_Q_LOCKED_VAL, &lock->val); > > - __atomic_acquire_fence(); > > + u32 new = _Q_LOCKED_VAL; > > + u32 prev; > > + > > + asm volatile( > > +"1: lwarx %0,0,%1,%3 # lock_set_locked \n" > > +" or %0,%0,%2 \n" > > +" stwcx. %0,0,%1 \n" > > +" bne- 1b \n" > > +"\t" PPC_ACQUIRE_BARRIER " \n" > > + : "=&r" (prev) > > + : "r" (&lock->val), "r" (new), > > + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) > > + : "cr0", "memory"); > > } > > This is pretty similar with the DEFINE_TESTOP() pattern from > arch/powerpc/include/asm/bitops.h (such as test_and_set_bits_lock()) except for > word instead of double word. Do you think it's possible / beneficial to make > use of those macros? Pulling almost all our atomic primitives into one place and making them usable by atomics, bitops, locks, etc. might be a good idea. That function specifically works on a dword so we can't use it here, and I don't want to modify any files except for the new ones in this series if possible, but consolidating our primitives a bit more would be nice. > > -/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */ > > -static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int val) > > +/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */ > > +static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old) > > { > > - int newval = _Q_LOCKED_VAL; > > - > > - if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val) > > + u32 new = _Q_LOCKED_VAL; > > + u32 prev; > > + > > + BUG_ON(old & _Q_LOCKED_VAL); > > The BUG_ON() could have been introduced in an earlier patch I think. Yes. 
> > + > > + asm volatile( > > +"1: lwarx %0,0,%1,%4 # trylock_clear_tail_cpu \n" > > +" cmpw 0,%0,%2 \n" > > +" bne- 2f \n" > > +" stwcx. %3,0,%1 \n" > > +" bne- 1b \n" > > +"\t" PPC_ACQUIRE_BARRIER " \n" > > +"2: \n" > > + : "=&r" (prev) > > + : "r" (&lock->val), "r"(old), "r" (new), > > Could this be like "r"(_Q_TAIL_CPU_MASK) below? > i.e. "r" (_Q_LOCKED_VAL)? Makes it clear new doesn't change. Sure. > > > + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) > > + : "cr0", "memory"); > > + > > + if (likely(prev == old)) > > return 1; > > - else > > - return 0; > > + return 0; > > } > > > > /* > > @@ -56,20 +82,25 @@ static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int va > > * This provides a release barrier for publishing node, and an acquire barrier > > Does the comment mean there needs to be an acquire barrier in this assembly? Yes, another good catch. What I'm going to do instead is add the acquire to get_tail_qnode() because that path is only hit when you have multiple waiters, and I think pairing it that way makes the barriers more obvious. Thanks, Nick
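
For reference, a sketch of the direction described above — pairing the release barrier in publish_tail_cpu() with an acquire on the reader side. This is an assumption about the respin based only on the description; the MAX_NODES lookup loop is inferred from the series' per-CPU qnode array and is not shown in this patch:

static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
{
	int cpu = get_tail_cpu(val);
	struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu);
	int idx;

	/*
	 * Seeing a previous tail in 'val' is a control dependency; this
	 * upgrades it to acquire, ordering the PPC_RELEASE_BARRIER in
	 * publish_tail_cpu() (performed by the previous queuer) before
	 * the qnode loads below.
	 */
	smp_acquire__after_ctrl_dep();

	for (idx = 0; idx < MAX_NODES; idx++) {
		struct qnode *qnode = &qnodesp->nodes[idx];

		if (qnode->lock == lock)
			return qnode;
	}

	BUG();
}
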
On Thu Nov 10, 2022 at 6:36 PM AEST, Christophe Leroy wrote: > > > On 10/11/2022 at 01:39, Jordan Niethe wrote: > >> +static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old) > >> { > >> - int newval = _Q_LOCKED_VAL; > >> - > >> - if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val) > >> + u32 new = _Q_LOCKED_VAL; > >> + u32 prev; > >> + > >> + BUG_ON(old & _Q_LOCKED_VAL); > > > > The BUG_ON() could have been introduced in an earlier patch I think. > > Can we avoid the BUG_ON() altogether and replace it with a WARN_ON()? The lock has gone wrong here. Critical sections not working means data corruption and little prospect of continuing to run. Thanks, Nick
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index 79a1936fb68d..3ab354159e5e 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -2,28 +2,43 @@ #ifndef _ASM_POWERPC_QSPINLOCK_H #define _ASM_POWERPC_QSPINLOCK_H -#include <linux/atomic.h> #include <linux/compiler.h> #include <asm/qspinlock_types.h> static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { - return atomic_read(&lock->val); + return READ_ONCE(lock->val); } static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) { - return !atomic_read(&lock.val); + return !lock.val; } static __always_inline int queued_spin_is_contended(struct qspinlock *lock) { - return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK); + return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK); } static __always_inline int queued_spin_trylock(struct qspinlock *lock) { - if (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0) + u32 new = _Q_LOCKED_VAL; + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1,%3 # queued_spin_trylock \n" +" cmpwi 0,%0,0 \n" +" bne- 2f \n" +" stwcx. %2,0,%1 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (new), + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) + : "cr0", "memory"); + + if (likely(prev == 0)) return 1; return 0; } diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h index 3425dab42576..210adf05b235 100644 --- a/arch/powerpc/include/asm/qspinlock_types.h +++ b/arch/powerpc/include/asm/qspinlock_types.h @@ -7,7 +7,7 @@ typedef struct qspinlock { union { - atomic_t val; + u32 val; #ifdef __LITTLE_ENDIAN struct { @@ -23,10 +23,10 @@ typedef struct qspinlock { }; } arch_spinlock_t; -#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = ATOMIC_INIT(0) } } +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = 0 } } /* - * Bitfields in the atomic value: + * Bitfields in the lock word: * * 0: locked bit * 16-31: tail cpu (+1) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 5ebb88d95636..7c71e5e287df 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/export.h> @@ -22,32 +21,59 @@ struct qnodes { static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); -static inline int encode_tail_cpu(void) +static inline u32 encode_tail_cpu(void) { return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET; } -static inline int get_tail_cpu(int val) +static inline int get_tail_cpu(u32 val) { return (val >> _Q_TAIL_CPU_OFFSET) - 1; } /* Take the lock by setting the bit, no other CPUs may concurrently lock it. */ +/* Take the lock by setting the lock bit, no other CPUs will touch it. */ static __always_inline void lock_set_locked(struct qspinlock *lock) { - atomic_or(_Q_LOCKED_VAL, &lock->val); - __atomic_acquire_fence(); + u32 new = _Q_LOCKED_VAL; + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1,%3 # lock_set_locked \n" +" or %0,%0,%2 \n" +" stwcx. %0,0,%1 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (new), + "i" (IS_ENABLED(CONFIG_PPC64) ? 
1 : 0) + : "cr0", "memory"); } -/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */ -static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int val) +/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */ +static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 old) { - int newval = _Q_LOCKED_VAL; - - if (atomic_cmpxchg_acquire(&lock->val, val, newval) == val) + u32 new = _Q_LOCKED_VAL; + u32 prev; + + BUG_ON(old & _Q_LOCKED_VAL); + + asm volatile( +"1: lwarx %0,0,%1,%4 # trylock_clear_tail_cpu \n" +" cmpw 0,%0,%2 \n" +" bne- 2f \n" +" stwcx. %3,0,%1 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r"(old), "r" (new), + "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) + : "cr0", "memory"); + + if (likely(prev == old)) return 1; - else - return 0; + return 0; } /* @@ -56,20 +82,25 @@ static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int va * This provides a release barrier for publishing node, and an acquire barrier * for getting the old node. */ -static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail) +static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail) { - for (;;) { - int val = atomic_read(&lock->val); - int newval = (val & ~_Q_TAIL_CPU_MASK) | tail; - int old; - - old = atomic_cmpxchg(&lock->val, val, newval); - if (old == val) - return old; - } + u32 prev, tmp; + + asm volatile( +"\t" PPC_RELEASE_BARRIER " \n" +"1: lwarx %0,0,%2 # publish_tail_cpu \n" +" andc %1,%0,%4 \n" +" or %1,%1,%3 \n" +" stwcx. %1,0,%2 \n" +" bne- 1b \n" + : "=&r" (prev), "=&r"(tmp) + : "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK) + : "cr0", "memory"); + + return prev; } -static struct qnode *get_tail_qnode(struct qspinlock *lock, int val) +static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) { int cpu = get_tail_cpu(val); struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu); @@ -88,7 +119,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) { struct qnodes *qnodesp; struct qnode *next, *node; - int val, old, tail; + u32 val, old, tail; int idx; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); @@ -134,7 +165,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) } /* We're at the head of the waitqueue, wait for the lock. */ - while ((val = atomic_read(&lock->val)) & _Q_LOCKED_VAL) + while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) cpu_relax(); /* If we're the last queued, must clean up the tail. */
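
One detail of the lock word worth spelling out: the tail field stores the CPU number plus one, so an all-zero field can mean "no CPU queued". A standalone round-trip demonstration (plain C, illustrative only; the offset of 16 follows the "16-31: tail cpu (+1)" bitfield comment in qspinlock_types.h):

#include <assert.h>
#include <stdint.h>

#define _Q_TAIL_CPU_OFFSET 16	/* per the "16-31: tail cpu (+1)" comment */

static uint32_t encode_tail_cpu_demo(int cpu)
{
	return (uint32_t)(cpu + 1) << _Q_TAIL_CPU_OFFSET;
}

static int get_tail_cpu_demo(uint32_t val)
{
	return (int)(val >> _Q_TAIL_CPU_OFFSET) - 1;
}

int main(void)
{
	/* CPU 5 encodes to 0x00060000 and decodes back to 5. */
	assert(encode_tail_cpu_demo(5) == 0x00060000);
	assert(get_tail_cpu_demo(0x00060000) == 5);

	/* An empty tail field decodes to -1: no CPU queued. */
	assert(get_tail_cpu_demo(0) == -1);
	return 0;
}
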