Message ID | 20220728063120.2867508-15-npiggin@gmail.com (mailing list archive)
---|---
State | Changes Requested
Series | powerpc: alternate queued spinlock implementation
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:

[resend as utf-8, not utf-7]

> This gives trylock slightly more strength, and it also gives most
> of the benefit of passing 'val' back through the slowpath without
> the complexity.
> ---
>  arch/powerpc/include/asm/qspinlock.h | 39 +++++++++++++++++++++++++++-
>  arch/powerpc/lib/qspinlock.c         |  9 +++++++
>  2 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
> index 44601b261e08..d3d2039237b2 100644
> --- a/arch/powerpc/include/asm/qspinlock.h
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -5,6 +5,8 @@
>  #include <linux/compiler.h>
>  #include <asm/qspinlock_types.h>
>
> +#define _Q_SPIN_TRY_LOCK_STEAL 1

Would this be a config option?

> +
>  static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
>  {
>          return READ_ONCE(lock->val);
> @@ -26,11 +28,12 @@ static __always_inline u32 queued_spin_get_locked_val(void)
>          return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET);
>  }
>
> -static __always_inline int queued_spin_trylock(struct qspinlock *lock)
> +static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock *lock)
>  {
>          u32 new = queued_spin_get_locked_val();
>          u32 prev;
>
> +        /* Trylock succeeds only when unlocked and no queued nodes */
>          asm volatile(
> "1:      lwarx   %0,0,%1,%3      # queued_spin_trylock           \n"

s/queued_spin_trylock/__queued_spin_trylock_nosteal

> "        cmpwi   0,%0,0                                          \n"
> @@ -49,6 +52,40 @@ static __always_inline int queued_spin_trylock(struct qspinlock *lock)
>          return 0;
>  }
>
> +static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock)
> +{
> +        u32 new = queued_spin_get_locked_val();
> +        u32 prev, tmp;
> +
> +        /* Trylock may get ahead of queued nodes if it finds unlocked */
> +        asm volatile(
> +"1:      lwarx   %0,0,%2,%5      # queued_spin_trylock           \n"

s/queued_spin_trylock/__queued_spin_trylock_steal

> +"        andc.   %1,%0,%4                                        \n"
> +"        bne-    2f                                              \n"
> +"        and     %1,%0,%4                                        \n"
> +"        or      %1,%1,%3                                        \n"
> +"        stwcx.  %1,0,%2                                         \n"
> +"        bne-    1b                                              \n"
> +"\t" PPC_ACQUIRE_BARRIER "                                       \n"
> +"2:                                                              \n"

Just because there's a little bit more going on here...

Q_TAIL_CPU_MASK = 0xFFFE0000
~Q_TAIL_CPU_MASK = 0x1FFFF

1:      lwarx   prev, 0, &lock->val, IS_ENABLED_PPC64
        andc.   tmp, prev, _Q_TAIL_CPU_MASK     (tmp = prev & ~_Q_TAIL_CPU_MASK)
        bne-    2f                              (exit if locked)
        and     tmp, prev, _Q_TAIL_CPU_MASK     (tmp = prev & _Q_TAIL_CPU_MASK)
        or      tmp, tmp, new                   (tmp |= new)
        stwcx.  tmp, 0, &lock->val
        bne-    1b
        PPC_ACQUIRE_BARRIER
2:

... which seems correct.

> +        : "=&r" (prev), "=&r" (tmp)
> +        : "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK),
> +          "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
> +        : "cr0", "memory");
> +
> +        if (likely(!(prev & ~_Q_TAIL_CPU_MASK)))
> +                return 1;
> +        return 0;
> +}
> +
> +static __always_inline int queued_spin_trylock(struct qspinlock *lock)
> +{
> +        if (!_Q_SPIN_TRY_LOCK_STEAL)
> +                return __queued_spin_trylock_nosteal(lock);
> +        else
> +                return __queued_spin_trylock_steal(lock);
> +}
> +
>  void queued_spin_lock_slowpath(struct qspinlock *lock);
>
>  static __always_inline void queued_spin_lock(struct qspinlock *lock)
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 3b10e31bcf0a..277aef1fab0a 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -24,7 +24,11 @@ struct qnodes {
>
>  /* Tuning parameters */
>  static int STEAL_SPINS __read_mostly = (1<<5);
> +#if _Q_SPIN_TRY_LOCK_STEAL == 1
> +static const bool MAYBE_STEALERS = true;
> +#else
>  static bool MAYBE_STEALERS __read_mostly = true;
> +#endif
>  static int HEAD_SPINS __read_mostly = (1<<8);
>
>  static bool pv_yield_owner __read_mostly = true;
> @@ -522,6 +526,10 @@ void pv_spinlocks_init(void)
>  #include <linux/debugfs.h>
>  static int steal_spins_set(void *data, u64 val)
>  {
> +#if _Q_SPIN_TRY_LOCK_STEAL == 1
> +        /* MAYBE_STEAL remains true */
> +        STEAL_SPINS = val;
> +#else
>          static DEFINE_MUTEX(lock);
>
>          mutex_lock(&lock);
> @@ -539,6 +547,7 @@ static int steal_spins_set(void *data, u64 val)
>                  STEAL_SPINS = val;
>          }
>          mutex_unlock(&lock);
> +#endif
>
>          return 0;
>  }
On Thu Nov 10, 2022 at 10:43 AM AEST, Jordan Niethe wrote:
> On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> [resend as utf-8, not utf-7]
> > This gives trylock slightly more strength, and it also gives most
> > of the benefit of passing 'val' back through the slowpath without
> > the complexity.
> > ---
> >  arch/powerpc/include/asm/qspinlock.h | 39 +++++++++++++++++++++++++++-
> >  arch/powerpc/lib/qspinlock.c         |  9 +++++++
> >  2 files changed, 47 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
> > index 44601b261e08..d3d2039237b2 100644
> > --- a/arch/powerpc/include/asm/qspinlock.h
> > +++ b/arch/powerpc/include/asm/qspinlock.h
> > @@ -5,6 +5,8 @@
> >  #include <linux/compiler.h>
> >  #include <asm/qspinlock_types.h>
> >
> > +#define _Q_SPIN_TRY_LOCK_STEAL 1
>
> Would this be a config option?

I think probably not, it's more to keep the other code variant there if
we want to try to tune and experiment with it. We might end up cutting
out a bunch of these options if we narrow down on a good configuration.

>
> > +
> >  static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
> >  {
> >          return READ_ONCE(lock->val);
> > @@ -26,11 +28,12 @@ static __always_inline u32 queued_spin_get_locked_val(void)
> >          return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET);
> >  }
> >
> > -static __always_inline int queued_spin_trylock(struct qspinlock *lock)
> > +static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock *lock)
> >  {
> >          u32 new = queued_spin_get_locked_val();
> >          u32 prev;
> >
> > +        /* Trylock succeeds only when unlocked and no queued nodes */
> >          asm volatile(
> > "1:      lwarx   %0,0,%1,%3      # queued_spin_trylock           \n"
>
> s/queued_spin_trylock/__queued_spin_trylock_nosteal

I wanted to keep those because they (can be) inlined into the wider
kernel, so you'd rather see queued_spin_trylock than this internal
name.

>
> > "        cmpwi   0,%0,0                                          \n"
> > @@ -49,6 +52,40 @@ static __always_inline int queued_spin_trylock(struct qspinlock *lock)
> >          return 0;
> >  }
> >
> > +static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock)
> > +{
> > +        u32 new = queued_spin_get_locked_val();
> > +        u32 prev, tmp;
> > +
> > +        /* Trylock may get ahead of queued nodes if it finds unlocked */
> > +        asm volatile(
> > +"1:      lwarx   %0,0,%2,%5      # queued_spin_trylock           \n"
>
> s/queued_spin_trylock/__queued_spin_trylock_steal
>
> > +"        andc.   %1,%0,%4                                        \n"
> > +"        bne-    2f                                              \n"
> > +"        and     %1,%0,%4                                        \n"
> > +"        or      %1,%1,%3                                        \n"
> > +"        stwcx.  %1,0,%2                                         \n"
> > +"        bne-    1b                                              \n"
> > +"\t" PPC_ACQUIRE_BARRIER "                                       \n"
> > +"2:                                                              \n"
>
> Just because there's a little bit more going on here...
>
> Q_TAIL_CPU_MASK = 0xFFFE0000
> ~Q_TAIL_CPU_MASK = 0x1FFFF
>
> 1:      lwarx   prev, 0, &lock->val, IS_ENABLED_PPC64
>         andc.   tmp, prev, _Q_TAIL_CPU_MASK     (tmp = prev & ~_Q_TAIL_CPU_MASK)
>         bne-    2f                              (exit if locked)
>         and     tmp, prev, _Q_TAIL_CPU_MASK     (tmp = prev & _Q_TAIL_CPU_MASK)
>         or      tmp, tmp, new                   (tmp |= new)
>         stwcx.  tmp, 0, &lock->val
>         bne-    1b
>         PPC_ACQUIRE_BARRIER
> 2:
>
> ... which seems correct.

Thanks,
Nick
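To make the walkthrough above easier to compare against, here is a rough C rendering of the two trylock variants using the GCC/Clang __atomic builtins rather than the patch's lwarx/stwcx. sequences. This is a sketch only, not the kernel code: TAIL_CPU_MASK and the function names are stand-ins for _Q_TAIL_CPU_MASK, queued_spin_get_locked_val() and the __queued_spin_trylock_* helpers.

/*
 * Illustrative sketch only -- not the patch's code.
 */
#include <stdbool.h>
#include <stdint.h>

#define TAIL_CPU_MASK   0xfffe0000u     /* queued-node (tail) bits */

/* Succeeds only if the whole word is zero: unlocked and no queued nodes. */
static bool trylock_nosteal(uint32_t *val, uint32_t locked)
{
        uint32_t expected = 0;

        return __atomic_compare_exchange_n(val, &expected, locked,
                                           false, __ATOMIC_ACQUIRE,
                                           __ATOMIC_RELAXED);
}

/*
 * Succeeds whenever the lock is free, even if waiters are queued:
 * preserve the tail bits and set only the locked/owner bits, which is
 * what the andc./and/or/stwcx. sequence does.
 */
static bool trylock_steal(uint32_t *val, uint32_t locked)
{
        uint32_t old = __atomic_load_n(val, __ATOMIC_RELAXED);

        do {
                if (old & ~TAIL_CPU_MASK)       /* locked: fail, like bne- 2f */
                        return false;
        } while (!__atomic_compare_exchange_n(val, &old,
                                              (old & TAIL_CPU_MASK) | locked,
                                              false, __ATOMIC_ACQUIRE,
                                              __ATOMIC_RELAXED));
        return true;
}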
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index 44601b261e08..d3d2039237b2 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -5,6 +5,8 @@
 #include <linux/compiler.h>
 #include <asm/qspinlock_types.h>
 
+#define _Q_SPIN_TRY_LOCK_STEAL 1
+
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 {
         return READ_ONCE(lock->val);
@@ -26,11 +28,12 @@ static __always_inline u32 queued_spin_get_locked_val(void)
         return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET);
 }
 
-static __always_inline int queued_spin_trylock(struct qspinlock *lock)
+static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock *lock)
 {
         u32 new = queued_spin_get_locked_val();
         u32 prev;
 
+        /* Trylock succeeds only when unlocked and no queued nodes */
         asm volatile(
 "1:      lwarx   %0,0,%1,%3      # queued_spin_trylock           \n"
 "        cmpwi   0,%0,0                                          \n"
@@ -49,6 +52,40 @@ static __always_inline int queued_spin_trylock(struct qspinlock *lock)
         return 0;
 }
 
+static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock)
+{
+        u32 new = queued_spin_get_locked_val();
+        u32 prev, tmp;
+
+        /* Trylock may get ahead of queued nodes if it finds unlocked */
+        asm volatile(
+"1:      lwarx   %0,0,%2,%5      # queued_spin_trylock           \n"
+"        andc.   %1,%0,%4                                        \n"
+"        bne-    2f                                              \n"
+"        and     %1,%0,%4                                        \n"
+"        or      %1,%1,%3                                        \n"
+"        stwcx.  %1,0,%2                                         \n"
+"        bne-    1b                                              \n"
+"\t" PPC_ACQUIRE_BARRIER "                                       \n"
+"2:                                                              \n"
+        : "=&r" (prev), "=&r" (tmp)
+        : "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK),
+          "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0)
+        : "cr0", "memory");
+
+        if (likely(!(prev & ~_Q_TAIL_CPU_MASK)))
+                return 1;
+        return 0;
+}
+
+static __always_inline int queued_spin_trylock(struct qspinlock *lock)
+{
+        if (!_Q_SPIN_TRY_LOCK_STEAL)
+                return __queued_spin_trylock_nosteal(lock);
+        else
+                return __queued_spin_trylock_steal(lock);
+}
+
 void queued_spin_lock_slowpath(struct qspinlock *lock);
 
 static __always_inline void queued_spin_lock(struct qspinlock *lock)
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 3b10e31bcf0a..277aef1fab0a 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -24,7 +24,11 @@ struct qnodes {
 
 /* Tuning parameters */
 static int STEAL_SPINS __read_mostly = (1<<5);
+#if _Q_SPIN_TRY_LOCK_STEAL == 1
+static const bool MAYBE_STEALERS = true;
+#else
 static bool MAYBE_STEALERS __read_mostly = true;
+#endif
 static int HEAD_SPINS __read_mostly = (1<<8);
 
 static bool pv_yield_owner __read_mostly = true;
@@ -522,6 +526,10 @@ void pv_spinlocks_init(void)
 #include <linux/debugfs.h>
 static int steal_spins_set(void *data, u64 val)
 {
+#if _Q_SPIN_TRY_LOCK_STEAL == 1
+        /* MAYBE_STEAL remains true */
+        STEAL_SPINS = val;
+#else
         static DEFINE_MUTEX(lock);
 
         mutex_lock(&lock);
@@ -539,6 +547,7 @@ static int steal_spins_set(void *data, u64 val)
                 STEAL_SPINS = val;
         }
         mutex_unlock(&lock);
+#endif
 
         return 0;
 }
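As a side note on the qspinlock.c hunk: once _Q_SPIN_TRY_LOCK_STEAL is fixed at build time, MAYBE_STEALERS can be a const, so the compiler is able to fold the tests on it and drop the untaken paths. A minimal, stand-alone illustration of that pattern follows; the names here are made up and this is not the kernel code.

#include <stdbool.h>

#define TRY_LOCK_STEAL 1        /* stand-in for _Q_SPIN_TRY_LOCK_STEAL */

#if TRY_LOCK_STEAL == 1
static const bool maybe_stealers = true;        /* compile-time constant */
#else
static bool maybe_stealers = true;              /* runtime-tunable, e.g. via debugfs */
#endif

/* With the const above, the test is folded and only the stealer-tolerant
 * path is emitted; the other branch is dead code. */
int lock_path(void)
{
        if (maybe_stealers)
                return 1;       /* path that must tolerate lock stealers */
        return 0;               /* simpler path, compiled out when const */
}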