Message ID | 1426157723-28548-1-git-send-email-haokexin@gmail.com (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
On Thu, 2015-03-12 at 18:55 +0800, Kevin Hao wrote: > I know Torsten Duwe has tried to add the ticket spinlock for powerpc > one year ago [1]. But it make no progress due to the conflict between > PPC_SPLPAR and lockref. We still don't find a better way to handle > this. But instead of waiting forever for a perfect solution, can't we > just use the ticket spinlock for the !CONFIG_PPC_SPLPAR? > > This is a very rough patch based on arm64 codes. I want to make sure > that this is acceptable before going step further. This just passed > build and boot test on a fsl t4240rdb board. I have done a simple > performance benchmark by running the following command ten times before > and after applying this patch: > ./perf bench sched messaging > > Before After > Averaged total time [sec]: 0.403 0.367 > > So we can see a ~9% performance enhancing. This patch depends on this > one [2]. I would do the ifdef'ing differently, something like CONFIG_PPC_HAS_LOCK_OWNER CONFIG_PPC_TICKET_LOCKS depends on !PPC_HAS_LOCK_OWNER and use these two in the code... 
with SPLPAR select'ing HAS_LOCK_OWNER > [1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-February/115195.html > [2] http://patchwork.ozlabs.org/patch/447563/ > > Signed-off-by: Kevin Hao <haokexin@gmail.com> > --- > arch/powerpc/include/asm/spinlock.h | 79 ++++++++++++++++++++++++++++++- > arch/powerpc/include/asm/spinlock_types.h | 16 +++++++ > arch/powerpc/lib/locks.c | 2 +- > 3 files changed, 95 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h > index d303cdad2519..3faf2507abe9 100644 > --- a/arch/powerpc/include/asm/spinlock.h > +++ b/arch/powerpc/include/asm/spinlock.h > @@ -54,6 +54,7 @@ > #define SYNC_IO > #endif > > +#ifdef CONFIG_PPC_SPLPAR > static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) > { > return lock.slock == 0; > @@ -89,6 +90,40 @@ static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) > return tmp; > } > > +#else > +static inline int arch_spin_value_unlocked(arch_spinlock_t lock) > +{ > + return lock.owner == lock.next; > +} > + > +static inline int arch_spin_is_locked(arch_spinlock_t *lock) > +{ > + return !arch_spin_value_unlocked(READ_ONCE(*lock)); > +} > + > +static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) > +{ > + unsigned int tmp; > + arch_spinlock_t lockval; > + > + __asm__ __volatile__ ( > +"1: " PPC_LWARX(%0,0,%2,1) "\n\ > + rotlwi %1,%0,16\n\ > + xor. %1,%1,%0\n\ > + bne- 2f\n\ > + add %0,%0,%3\n\ > + stwcx. 
%0,0,%2\n\ > + bne- 1b\n" > + PPC_ACQUIRE_BARRIER > +"2:" > + : "=&r" (lockval), "=&r" (tmp) > + : "r" (lock), "r" (1 << TICKET_SHIFT) > + : "cr0", "memory"); > + > + return tmp; > +} > +#endif > + > static inline int arch_spin_trylock(arch_spinlock_t *lock) > { > CLEAR_IO_SYNC; > @@ -120,6 +155,7 @@ extern void __rw_yield(arch_rwlock_t *lock); > #define SHARED_PROCESSOR 0 > #endif > > +#ifdef CONFIG_PPC_SPLPAR > static inline void arch_spin_lock(arch_spinlock_t *lock) > { > CLEAR_IO_SYNC; > @@ -155,16 +191,57 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) > local_irq_restore(flags_dis); > } > } > +#else > +#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) > + > +static inline int arch_spin_is_contended(arch_spinlock_t *lock) > +{ > + arch_spinlock_t lockval = READ_ONCE(*lock); > + return (lockval.next - lockval.owner) > 1; > +} > +#define arch_spin_is_contended arch_spin_is_contended > + > +static inline void arch_spin_lock(arch_spinlock_t *lock) > +{ > + unsigned int tmp; > + arch_spinlock_t lockval; > + > + CLEAR_IO_SYNC; > + __asm__ __volatile__ ( > +"1: " PPC_LWARX(%0,0,%2,1) "\n\ > + add %1,%0,%4\n\ > + stwcx. 
%1,0,%2\n\ > + bne- 1b\n\ > + rotlwi %1,%0,16\n\ > + cmpw %1,%0\n\ > + beq 3f\n\ > + rlwinm %0,%0,16,16,31\n\ > +2: or 1,1,1\n\ > + lhz %1,0(%3)\n\ > + cmpw %1,%0\n\ > + bne 2b\n\ > + or 2,2,2\n\ > +3:" > + PPC_ACQUIRE_BARRIER > + : "=&r" (lockval), "=&r" (tmp) > + : "r"(lock), "r" (&lock->owner), "r" (1 << TICKET_SHIFT) > + : "cr0", "memory"); > +} > +#endif > > static inline void arch_spin_unlock(arch_spinlock_t *lock) > { > SYNC_IO; > __asm__ __volatile__("# arch_spin_unlock\n\t" > PPC_RELEASE_BARRIER: : :"memory"); > +#ifdef CONFIG_PPC_SPLPAR > lock->slock = 0; > +#else > + lock->owner++; > +#endif > } > > -#ifdef CONFIG_PPC64 > +#ifdef CONFIG_PPC_SPLPAR > extern void arch_spin_unlock_wait(arch_spinlock_t *lock); > #else > #define arch_spin_unlock_wait(lock) \ > diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h > index 2351adc4fdc4..1af94f290363 100644 > --- a/arch/powerpc/include/asm/spinlock_types.h > +++ b/arch/powerpc/include/asm/spinlock_types.h > @@ -5,11 +5,27 @@ > # error "please don't include this file directly" > #endif > > +#ifdef CONFIG_PPC_SPLPAR > typedef struct { > volatile unsigned int slock; > } arch_spinlock_t; > > #define __ARCH_SPIN_LOCK_UNLOCKED { 0 } > +#else > +#define TICKET_SHIFT 16 > + > +typedef struct { > +#ifdef __BIG_ENDIAN__ > + u16 next; > + u16 owner; > +#else > + u16 owner; > + u16 next; > +#endif > +} __aligned(4) arch_spinlock_t; > + > +#define __ARCH_SPIN_LOCK_UNLOCKED { 0 , 0 } > +#endif /*CONFIG_PPC_SPLPAR*/ > > typedef struct { > volatile signed int lock; > diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c > index 170a0346f756..fe3d21eeb10d 100644 > --- a/arch/powerpc/lib/locks.c > +++ b/arch/powerpc/lib/locks.c > @@ -66,7 +66,6 @@ void __rw_yield(arch_rwlock_t *rw) > plpar_hcall_norets(H_CONFER, > get_hard_smp_processor_id(holder_cpu), yield_count); > } > -#endif > > void arch_spin_unlock_wait(arch_spinlock_t *lock) > { > @@ -83,3 +82,4 @@ void 
arch_spin_unlock_wait(arch_spinlock_t *lock) > } > > EXPORT_SYMBOL(arch_spin_unlock_wait); > +#endif
On Thu, Mar 12, 2015 at 10:13:27PM +1100, Benjamin Herrenschmidt wrote: > On Thu, 2015-03-12 at 18:55 +0800, Kevin Hao wrote: > > I know Torsten Duwe has tried to add the ticket spinlock for powerpc > > one year ago [1]. But it make no progress due to the conflict between OMG, time flies. > > PPC_SPLPAR and lockref. We still don't find a better way to handle > > this. But instead of waiting forever for a perfect solution, can't we > > just use the ticket spinlock for the !CONFIG_PPC_SPLPAR? I was actually thinking about squeezing it all (incl. lockref) into 64 bits, or making it a bit larger, keeping the holder outside the cache line, or ... Then priorities shifted. > I would do the ifdef'ing differently, something like > > CONFIG_PPC_HAS_LOCK_OWNER > > CONFIG_PPC_TICKET_LOCKS depends on !PPC_HAS_LOCK_OWNER > > and use these two in the code... with SPLPAR select'ing HAS_LOCK_OWNER Or avoid ifdef'ing and give pseries its own, platform-specific lock implementation? It's more work in the build framework but clearer in the C source. Just a though. But generally, which platforms would benefit most from this change? I must admit my access to hardware variety is somewhat limited ;-) Torsten
On Thu, Mar 12, 2015 at 10:13:27PM +1100, Benjamin Herrenschmidt wrote: > I would do the ifdef'ing differently, something like > > CONFIG_PPC_HAS_LOCK_OWNER > > CONFIG_PPC_TICKET_LOCKS depends on !PPC_HAS_LOCK_OWNER > > and use these two in the code... with SPLPAR select'ing HAS_LOCK_OWNER OK, will do. Thanks, Kevin
On Thu, Mar 12, 2015 at 04:24:10PM +0100, Torsten Duwe wrote:
> But generally, which platforms would benefit most from this change?
In theory, the more cpus the platform has, the more serious the thundering
herd problem is. So the latest platforms which have more cpus would benefit
most.
Thanks,
Kevin
On Fri, 2015-03-13 at 14:09 +0800, Kevin Hao wrote: > On Thu, Mar 12, 2015 at 04:24:10PM +0100, Torsten Duwe wrote: > > But generally, which platforms would benefit most from this change? > > In theory, the more cpus the platform has, the more serious the thundering > herd problem is. So the latest platforms which has more cpus would benefit > most. In fact, pseries would be one to benefit ... it would be nice to find a way to sort out the lock owner problem. One idea I had was to keep the owners elsewhere, some kind of hash chain of "taken locks" hashed on the lock address. There shouldn't be *that* many locks held at a given point in time. Trying to keep the owner in a separate & non-atomic part of the lock proved being too hard due to how the lockref and generic code are interwined, though maybe somebody smarter can find a trick :-) Cheers, Ben.
On Thu, 2015-03-12 at 22:13 +1100, Benjamin Herrenschmidt wrote: > On Thu, 2015-03-12 at 18:55 +0800, Kevin Hao wrote: > > I know Torsten Duwe has tried to add the ticket spinlock for powerpc > > one year ago [1]. But it make no progress due to the conflict between > > PPC_SPLPAR and lockref. We still don't find a better way to handle > > this. But instead of waiting forever for a perfect solution, can't we > > just use the ticket spinlock for the !CONFIG_PPC_SPLPAR? > > > > This is a very rough patch based on arm64 codes. I want to make sure > > that this is acceptable before going step further. This just passed > > build and boot test on a fsl t4240rdb board. I have done a simple > > performance benchmark by running the following command ten times before > > and after applying this patch: > > ./perf bench sched messaging > > > > Before After > > Averaged total time [sec]: 0.403 0.367 > > > > So we can see a ~9% performance enhancing. This patch depends on this > > one [2]. > > I would do the ifdef'ing differently, something like > > CONFIG_PPC_HAS_LOCK_OWNER > > CONFIG_PPC_TICKET_LOCKS depends on !PPC_HAS_LOCK_OWNER > > and use these two in the code... with SPLPAR select'ing HAS_LOCK_OWNER Sam was doing some work looking at CONFER, and I think so far he hasn't found that it is much of a benefit. Hopefully he can chime in with his observations. So the question is, should we just drop the directed CONFER and switch wholesale to ticket locks? We can still do CONFER on SPLPAR, we just tell the hypervisor we don't know who to confer to. There is still the drawback that we loose the lock owner for debugging, but that might be worth it. And I think you can get it back with appropriate debug options? cheers
On Fri, 2015-03-13 at 18:09 +1100, Michael Ellerman wrote: > On Thu, 2015-03-12 at 22:13 +1100, Benjamin Herrenschmidt wrote: > > On Thu, 2015-03-12 at 18:55 +0800, Kevin Hao wrote: > > > I know Torsten Duwe has tried to add the ticket spinlock for powerpc > > > one year ago [1]. But it make no progress due to the conflict between > > > PPC_SPLPAR and lockref. We still don't find a better way to handle > > > this. But instead of waiting forever for a perfect solution, can't we > > > just use the ticket spinlock for the !CONFIG_PPC_SPLPAR? > > > > > > This is a very rough patch based on arm64 codes. I want to make sure > > > that this is acceptable before going step further. This just passed > > > build and boot test on a fsl t4240rdb board. I have done a simple > > > performance benchmark by running the following command ten times before > > > and after applying this patch: > > > ./perf bench sched messaging > > > > > > Before After > > > Averaged total time [sec]: 0.403 0.367 > > > > > > So we can see a ~9% performance enhancing. This patch depends on this > > > one [2]. > > > > I would do the ifdef'ing differently, something like > > > > CONFIG_PPC_HAS_LOCK_OWNER > > > > CONFIG_PPC_TICKET_LOCKS depends on !PPC_HAS_LOCK_OWNER > > > > and use these two in the code... with SPLPAR select'ing HAS_LOCK_OWNER > > > Sam was doing some work looking at CONFER, and I think so far he hasn't found > that it is much of a benefit. Hopefully he can chime in with his observations. > > So the question is, should we just drop the directed CONFER and switch > wholesale to ticket locks? > > We can still do CONFER on SPLPAR, we just tell the hypervisor we don't know who > to confer to. > > There is still the drawback that we loose the lock owner for debugging, but > that might be worth it. And I think you can get it back with appropriate debug > options? Another possibility would be to change the order of the count and the lock in lockref and defined it to be packed. 
That might allow us to have our lock contain the ticket first and the count next and "fit" ... > cheers >
On 13/03/15 18:14, Benjamin Herrenschmidt wrote: > On Fri, 2015-03-13 at 18:09 +1100, Michael Ellerman wrote: >> On Thu, 2015-03-12 at 22:13 +1100, Benjamin Herrenschmidt wrote: >>> On Thu, 2015-03-12 at 18:55 +0800, Kevin Hao wrote: >>>> I know Torsten Duwe has tried to add the ticket spinlock for powerpc >>>> one year ago [1]. But it make no progress due to the conflict between >>>> PPC_SPLPAR and lockref. We still don't find a better way to handle >>>> this. But instead of waiting forever for a perfect solution, can't we >>>> just use the ticket spinlock for the !CONFIG_PPC_SPLPAR? >>>> >>>> This is a very rough patch based on arm64 codes. I want to make sure >>>> that this is acceptable before going step further. This just passed >>>> build and boot test on a fsl t4240rdb board. I have done a simple >>>> performance benchmark by running the following command ten times before >>>> and after applying this patch: >>>> ./perf bench sched messaging >>>> >>>> Before After >>>> Averaged total time [sec]: 0.403 0.367 >>>> >>>> So we can see a ~9% performance enhancing. This patch depends on this >>>> one [2]. >>> >>> I would do the ifdef'ing differently, something like >>> >>> CONFIG_PPC_HAS_LOCK_OWNER >>> >>> CONFIG_PPC_TICKET_LOCKS depends on !PPC_HAS_LOCK_OWNER >>> >>> and use these two in the code... with SPLPAR select'ing HAS_LOCK_OWNER >> >> >> Sam was doing some work looking at CONFER, and I think so far he hasn't found >> that it is much of a benefit. Hopefully he can chime in with his observations. >> >> So the question is, should we just drop the directed CONFER and switch >> wholesale to ticket locks? Confer certainly helps in some situations, although I think it will be strongly workload and configuration dependent and I haven't tried to do much realistic testing. 
Also, being able to use confer depends on both knowing the lock owner and also that the lock owner's VCPU is not currently running (which is done via the yield count, which is also part of the lock token at the moment). With just the yield count we could still use an undirected confer (which might be almost as good as a directed one), but without it we would have to drop it entirely. >> We can still do CONFER on SPLPAR, we just tell the hypervisor we don't know who >> to confer to. >> >> There is still the drawback that we loose the lock owner for debugging, but >> that might be worth it. And I think you can get it back with appropriate debug >> options? > > Another possibility would be to change the order of the count and the > lock in lockref and defined it to be packed. That might allow us to have > our lock contain the ticket first and the count next and "fit" ... > >> cheers >> > > > _______________________________________________ > Linuxppc-dev mailing list > Linuxppc-dev@lists.ozlabs.org > https://lists.ozlabs.org/listinfo/linuxppc-dev >
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index d303cdad2519..3faf2507abe9 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -54,6 +54,7 @@ #define SYNC_IO #endif +#ifdef CONFIG_PPC_SPLPAR static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) { return lock.slock == 0; @@ -89,6 +90,40 @@ static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) return tmp; } +#else +static inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + return lock.owner == lock.next; +} + +static inline int arch_spin_is_locked(arch_spinlock_t *lock) +{ + return !arch_spin_value_unlocked(READ_ONCE(*lock)); +} + +static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) +{ + unsigned int tmp; + arch_spinlock_t lockval; + + __asm__ __volatile__ ( +"1: " PPC_LWARX(%0,0,%2,1) "\n\ + rotlwi %1,%0,16\n\ + xor. %1,%1,%0\n\ + bne- 2f\n\ + add %0,%0,%3\n\ + stwcx. %0,0,%2\n\ + bne- 1b\n" + PPC_ACQUIRE_BARRIER +"2:" + : "=&r" (lockval), "=&r" (tmp) + : "r" (lock), "r" (1 << TICKET_SHIFT) + : "cr0", "memory"); + + return tmp; +} +#endif + static inline int arch_spin_trylock(arch_spinlock_t *lock) { CLEAR_IO_SYNC; @@ -120,6 +155,7 @@ extern void __rw_yield(arch_rwlock_t *lock); #define SHARED_PROCESSOR 0 #endif +#ifdef CONFIG_PPC_SPLPAR static inline void arch_spin_lock(arch_spinlock_t *lock) { CLEAR_IO_SYNC; @@ -155,16 +191,57 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) local_irq_restore(flags_dis); } } +#else +#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) + +static inline int arch_spin_is_contended(arch_spinlock_t *lock) +{ + arch_spinlock_t lockval = READ_ONCE(*lock); + return (lockval.next - lockval.owner) > 1; +} +#define arch_spin_is_contended arch_spin_is_contended + +static inline void arch_spin_lock(arch_spinlock_t *lock) +{ + unsigned int tmp; + arch_spinlock_t lockval; + + CLEAR_IO_SYNC; + __asm__ 
__volatile__ ( +"1: " PPC_LWARX(%0,0,%2,1) "\n\ + add %1,%0,%4\n\ + stwcx. %1,0,%2\n\ + bne- 1b\n\ + rotlwi %1,%0,16\n\ + cmpw %1,%0\n\ + beq 3f\n\ + rlwinm %0,%0,16,16,31\n\ +2: or 1,1,1\n\ + lhz %1,0(%3)\n\ + cmpw %1,%0\n\ + bne 2b\n\ + or 2,2,2\n\ +3:" + PPC_ACQUIRE_BARRIER + : "=&r" (lockval), "=&r" (tmp) + : "r"(lock), "r" (&lock->owner), "r" (1 << TICKET_SHIFT) + : "cr0", "memory"); +} +#endif static inline void arch_spin_unlock(arch_spinlock_t *lock) { SYNC_IO; __asm__ __volatile__("# arch_spin_unlock\n\t" PPC_RELEASE_BARRIER: : :"memory"); +#ifdef CONFIG_PPC_SPLPAR lock->slock = 0; +#else + lock->owner++; +#endif } -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_SPLPAR extern void arch_spin_unlock_wait(arch_spinlock_t *lock); #else #define arch_spin_unlock_wait(lock) \ diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index 2351adc4fdc4..1af94f290363 100644 --- a/arch/powerpc/include/asm/spinlock_types.h +++ b/arch/powerpc/include/asm/spinlock_types.h @@ -5,11 +5,27 @@ # error "please don't include this file directly" #endif +#ifdef CONFIG_PPC_SPLPAR typedef struct { volatile unsigned int slock; } arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { 0 } +#else +#define TICKET_SHIFT 16 + +typedef struct { +#ifdef __BIG_ENDIAN__ + u16 next; + u16 owner; +#else + u16 owner; + u16 next; +#endif +} __aligned(4) arch_spinlock_t; + +#define __ARCH_SPIN_LOCK_UNLOCKED { 0 , 0 } +#endif /*CONFIG_PPC_SPLPAR*/ typedef struct { volatile signed int lock; diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 170a0346f756..fe3d21eeb10d 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -66,7 +66,6 @@ void __rw_yield(arch_rwlock_t *rw) plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(holder_cpu), yield_count); } -#endif void arch_spin_unlock_wait(arch_spinlock_t *lock) { @@ -83,3 +82,4 @@ void arch_spin_unlock_wait(arch_spinlock_t *lock) } EXPORT_SYMBOL(arch_spin_unlock_wait); +#endif
I know Torsten Duwe has tried to add the ticket spinlock for powerpc one year ago [1]. But it make no progress due to the conflict between PPC_SPLPAR and lockref. We still don't find a better way to handle this. But instead of waiting forever for a perfect solution, can't we just use the ticket spinlock for the !CONFIG_PPC_SPLPAR? This is a very rough patch based on arm64 codes. I want to make sure that this is acceptable before going step further. This just passed build and boot test on a fsl t4240rdb board. I have done a simple performance benchmark by running the following command ten times before and after applying this patch: ./perf bench sched messaging Before After Averaged total time [sec]: 0.403 0.367 So we can see a ~9% performance enhancing. This patch depends on this one [2]. [1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-February/115195.html [2] http://patchwork.ozlabs.org/patch/447563/ Signed-off-by: Kevin Hao <haokexin@gmail.com> --- arch/powerpc/include/asm/spinlock.h | 79 ++++++++++++++++++++++++++++++- arch/powerpc/include/asm/spinlock_types.h | 16 +++++++ arch/powerpc/lib/locks.c | 2 +- 3 files changed, 95 insertions(+), 2 deletions(-)