Message ID | 5343F8F1.4000400@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
Ping. On 08-04-2014 10:26, Adhemerval Zanella wrote: > This patch adds a single-thread optimization for libc.so locks used > within the shared objects. For each lock operations it checks it the > process has already spawned one thread and if not use non-atomic > operations. Other libraries (libpthread.so for instance) are unaffected > by this change. > > This is similar to x86_64 optimization on locks and atomics by using the > __libc_multiple_threads variable. > > Tested on powerpc32, powerpc64, and powerp64le. > > Note: for macro code change I tried to change as little as possible the > current syntax. > > -- > > * nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h > (__lll_robust_trylock): Add single-thread lock optimization for calls > within libc.so. > * sysdeps/powerpc/bits/atomic.h > (__arch_compare_and_exchange_val_32_acq): Likewise. > (__arch_compare_and_exchange_val_32_rel): Likewise. > (__arch_atomic_exchange_32_acq): Likewise. > (__arch_atomic_exchange_32_rel): Likewise. > (__arch_atomic_exchange_and_add_32): Likewise. > (__arch_atomic_increment_val_32): Likewise. > (__arch_atomic_decrement_val_32): Likewise. > (__arch_atomic_decrement_if_positive_32): Likewise. > * sysdeps/powerpc/powerpc32/bits/atomic.h > (__arch_compare_and_exchange_bool_32_acq): Likewise. > (__arch_compare_and_exchange_bool_32_rel): Likewise. > * sysdeps/powerpc/powerpc64/bits/atomic.h > (__arch_compare_and_exchange_bool_32_acq): Likewise. > (__arch_compare_and_exchange_bool_32_rel): Likewise. > (__arch_compare_and_exchange_bool_64_acq): Likewise. > (__arch_compare_and_exchange_bool_64_rel): Likewise. > (__arch_compare_and_exchange_val_64_acq): Likewise. > (__arch_compare_and_exchange_val_64_rel): Likewise. > (__arch_atomic_exchange_64_acq): Likewise. > (__arch_atomic_exchange_64_rel): Likewise. > (__arch_atomic_exchange_and_add_64): Likewise. > (__arch_atomic_increment_val_64): Likewise. > (__arch_atomic_decrement_val_64): Likewise. 
> (__arch_atomic_decrement_if_positive_64): Likewise. > > --- > > diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h > index ab92c3f..419ee2f 100644 > --- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h > +++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h > @@ -205,7 +205,9 @@ > /* Set *futex to ID if it is 0, atomically. Returns the old value */ > #define __lll_robust_trylock(futex, id) \ > ({ int __val; \ > - __asm __volatile ("1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > + "1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ > " cmpwi 0,%0,0\n" \ > " bne 2f\n" \ > " stwcx. %3,0,%2\n" \ > @@ -214,6 +216,12 @@ > : "=&r" (__val), "=m" (*futex) \ > : "r" (futex), "r" (id), "m" (*futex) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = *futex; \ > + if (__val == 0) \ > + *futex = id; \ > + } \ > __val; \ > }) > > diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h > index 2ffba48..2d31411 100644 > --- a/sysdeps/powerpc/bits/atomic.h > +++ b/sysdeps/powerpc/bits/atomic.h > @@ -76,6 +76,10 @@ typedef uintmax_t uatomic_max_t; > # define MUTEX_HINT_REL > #endif > > +/* Note: SINGLE_THREAD_P is defined either in > + sysdeps/powerpc/powerpc64/bits/atomic.h or > + sysdeps/powerpc/powerpc32/bits/atomic.h */ > + > #define atomic_full_barrier() __asm ("sync" ::: "memory") > #define atomic_write_barrier() __asm ("eieio" ::: "memory") > > @@ -83,7 +87,8 @@ typedef uintmax_t uatomic_max_t; > ({ \ > __typeof (*(mem)) __tmp; \ > __typeof (mem) __memp = (mem); \ > - __asm __volatile ( \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > "1: lwarx %0,0,%1" MUTEX_HINT_ACQ "\n" \ > " cmpw %0,%2\n" \ > " bne 2f\n" \ > @@ -93,6 +98,12 @@ typedef uintmax_t uatomic_max_t; > : "=&r" (__tmp) \ > : "b" (__memp), "r" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = *__memp; \ > + if (__tmp == oldval) \ > + 
*__memp = newval; \ > + } \ > __tmp; \ > }) > > @@ -100,7 +111,8 @@ typedef uintmax_t uatomic_max_t; > ({ \ > __typeof (*(mem)) __tmp; \ > __typeof (mem) __memp = (mem); \ > - __asm __volatile (__ARCH_REL_INSTR "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile (__ARCH_REL_INSTR "\n" \ > "1: lwarx %0,0,%1" MUTEX_HINT_REL "\n" \ > " cmpw %0,%2\n" \ > " bne 2f\n" \ > @@ -110,13 +122,20 @@ typedef uintmax_t uatomic_max_t; > : "=&r" (__tmp) \ > : "b" (__memp), "r" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = *__memp; \ > + if (__tmp == oldval) \ > + *__memp = newval; \ > + } \ > __tmp; \ > }) > > #define __arch_atomic_exchange_32_acq(mem, value) \ > ({ \ > __typeof (*mem) __val; \ > - __asm __volatile ( \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > "1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ > " stwcx. %3,0,%2\n" \ > " bne- 1b\n" \ > @@ -124,64 +143,92 @@ typedef uintmax_t uatomic_max_t; > : "=&r" (__val), "=m" (*mem) \ > : "b" (mem), "r" (value), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = *mem; \ > + *mem = value; \ > + } \ > __val; \ > }) > > #define __arch_atomic_exchange_32_rel(mem, value) \ > ({ \ > __typeof (*mem) __val; \ > - __asm __volatile (__ARCH_REL_INSTR "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile (__ARCH_REL_INSTR "\n" \ > "1: lwarx %0,0,%2" MUTEX_HINT_REL "\n" \ > " stwcx. %3,0,%2\n" \ > " bne- 1b" \ > : "=&r" (__val), "=m" (*mem) \ > : "b" (mem), "r" (value), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = *mem; \ > + *mem = value; \ > + } \ > __val; \ > }) > > #define __arch_atomic_exchange_and_add_32(mem, value) \ > ({ \ > __typeof (*mem) __val, __tmp; \ > - __asm __volatile ("1: lwarx %0,0,%3\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > + "1: lwarx %0,0,%3\n" \ > " add %1,%0,%4\n" \ > " stwcx. 
%1,0,%3\n" \ > " bne- 1b" \ > : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ > : "b" (mem), "r" (value), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = *mem; \ > + *mem += value; \ > + } \ > __val; \ > }) > > #define __arch_atomic_increment_val_32(mem) \ > ({ \ > __typeof (*(mem)) __val; \ > - __asm __volatile ("1: lwarx %0,0,%2\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > + "1: lwarx %0,0,%2\n" \ > " addi %0,%0,1\n" \ > " stwcx. %0,0,%2\n" \ > " bne- 1b" \ > : "=&b" (__val), "=m" (*mem) \ > : "b" (mem), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + __val = ++(*mem); \ > __val; \ > }) > > #define __arch_atomic_decrement_val_32(mem) \ > ({ \ > __typeof (*(mem)) __val; \ > - __asm __volatile ("1: lwarx %0,0,%2\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > + "1: lwarx %0,0,%2\n" \ > " subi %0,%0,1\n" \ > " stwcx. %0,0,%2\n" \ > " bne- 1b" \ > : "=&b" (__val), "=m" (*mem) \ > : "b" (mem), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + __val = --(*mem); \ > __val; \ > }) > > #define __arch_atomic_decrement_if_positive_32(mem) \ > ({ int __val, __tmp; \ > - __asm __volatile ("1: lwarx %0,0,%3\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > + "1: lwarx %0,0,%3\n" \ > " cmpwi 0,%0,0\n" \ > " addi %1,%0,-1\n" \ > " ble 2f\n" \ > @@ -191,6 +238,12 @@ typedef uintmax_t uatomic_max_t; > : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ > : "b" (mem), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = (*mem); \ > + if (__val > 0) \ > + --(*mem); \ > + } \ > __val; \ > }) > > diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h > index 7613bdc..08043a7 100644 > --- a/sysdeps/powerpc/powerpc32/bits/atomic.h > +++ b/sysdeps/powerpc/powerpc32/bits/atomic.h > @@ -17,6 +17,8 @@ > License along with the GNU C Library; if not, see > <http://www.gnu.org/licenses/>. */ > > +#include <tls.h> > + > /* POWER6 adds a "Mutex Hint" to the Load and Reserve instruction. 
> This is a hint to the hardware to expect additional updates adjacent > to the lock word or not. If we are acquiring a Mutex, the hint > @@ -33,6 +35,14 @@ > # define MUTEX_HINT_REL > #endif > > +/* Check if the process has created a thread. */ > +#ifndef NOT_IN_libc > +# define SINGLE_THREAD_P \ > + (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) > +#else > +# define SINGLE_THREAD_P 0 > +#endif > + > /* > * The 32-bit exchange_bool is different on powerpc64 because the subf > * does signed 64-bit arithmetic while the lwarx is 32-bit unsigned > @@ -42,7 +52,8 @@ > #define __arch_compare_and_exchange_bool_32_acq(mem, newval, oldval) \ > ({ \ > unsigned int __tmp; \ > - __asm __volatile ( \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > "1: lwarx %0,0,%1" MUTEX_HINT_ACQ "\n" \ > " subf. %0,%2,%0\n" \ > " bne 2f\n" \ > @@ -52,13 +63,20 @@ > : "=&r" (__tmp) \ > : "b" (mem), "r" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = !(*mem == oldval); \ > + if (!__tmp) \ > + *mem = newval; \ > + } \ > __tmp != 0; \ > }) > > #define __arch_compare_and_exchange_bool_32_rel(mem, newval, oldval) \ > ({ \ > unsigned int __tmp; \ > - __asm __volatile (__ARCH_REL_INSTR "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile (__ARCH_REL_INSTR "\n" \ > "1: lwarx %0,0,%1" MUTEX_HINT_REL "\n" \ > " subf. %0,%2,%0\n" \ > " bne 2f\n" \ > @@ -68,6 +86,12 @@ > : "=&r" (__tmp) \ > : "b" (mem), "r" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = !(*mem == oldval); \ > + if (!__tmp) \ > + *mem = newval; \ > + } \ > __tmp != 0; \ > }) > > diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h > index 527fe7c..0e2fe98 100644 > --- a/sysdeps/powerpc/powerpc64/bits/atomic.h > +++ b/sysdeps/powerpc/powerpc64/bits/atomic.h > @@ -17,6 +17,8 @@ > License along with the GNU C Library; if not, see > <http://www.gnu.org/licenses/>. 
*/ > > +#include <tls.h> > + > /* POWER6 adds a "Mutex Hint" to the Load and Reserve instruction. > This is a hint to the hardware to expect additional updates adjacent > to the lock word or not. If we are acquiring a Mutex, the hint > @@ -33,6 +35,15 @@ > # define MUTEX_HINT_REL > #endif > > +/* Check if the process has created a thread. The lock optimization is only > + for locks within libc.so. */ > +#ifndef NOT_IN_libc > +# define SINGLE_THREAD_P \ > + (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) > +#else > +# define SINGLE_THREAD_P 0 > +#endif > + > /* The 32-bit exchange_bool is different on powerpc64 because the subf > does signed 64-bit arithmetic while the lwarx is 32-bit unsigned > (a load word and zero (high 32) form) load. > @@ -42,7 +53,8 @@ > #define __arch_compare_and_exchange_bool_32_acq(mem, newval, oldval) \ > ({ \ > unsigned int __tmp, __tmp2; \ > - __asm __volatile (" clrldi %1,%1,32\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile (" clrldi %1,%1,32\n" \ > "1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ > " subf. %0,%1,%0\n" \ > " bne 2f\n" \ > @@ -52,13 +64,20 @@ > : "=&r" (__tmp), "=r" (__tmp2) \ > : "b" (mem), "1" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = !(*mem == oldval); \ > + if (!__tmp) \ > + *mem = newval; \ > + } \ > __tmp != 0; \ > }) > > #define __arch_compare_and_exchange_bool_32_rel(mem, newval, oldval) \ > ({ \ > unsigned int __tmp, __tmp2; \ > - __asm __volatile (__ARCH_REL_INSTR "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile (__ARCH_REL_INSTR "\n" \ > " clrldi %1,%1,32\n" \ > "1: lwarx %0,0,%2" MUTEX_HINT_REL "\n" \ > " subf. 
%0,%1,%0\n" \ > @@ -69,6 +88,12 @@ > : "=&r" (__tmp), "=r" (__tmp2) \ > : "b" (mem), "1" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = !(*mem == oldval); \ > + if (!__tmp) \ > + *mem = newval; \ > + } \ > __tmp != 0; \ > }) > > @@ -80,7 +105,8 @@ > #define __arch_compare_and_exchange_bool_64_acq(mem, newval, oldval) \ > ({ \ > unsigned long __tmp; \ > - __asm __volatile ( \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > "1: ldarx %0,0,%1" MUTEX_HINT_ACQ "\n" \ > " subf. %0,%2,%0\n" \ > " bne 2f\n" \ > @@ -90,13 +116,20 @@ > : "=&r" (__tmp) \ > : "b" (mem), "r" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = !(*mem == oldval); \ > + if (!__tmp) \ > + *mem = newval; \ > + } \ > __tmp != 0; \ > }) > > #define __arch_compare_and_exchange_bool_64_rel(mem, newval, oldval) \ > ({ \ > unsigned long __tmp; \ > - __asm __volatile (__ARCH_REL_INSTR "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile (__ARCH_REL_INSTR "\n" \ > "1: ldarx %0,0,%2" MUTEX_HINT_REL "\n" \ > " subf. 
%0,%2,%0\n" \ > " bne 2f\n" \ > @@ -106,6 +139,12 @@ > : "=&r" (__tmp) \ > : "b" (mem), "r" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = !(*mem == oldval); \ > + if (!__tmp) \ > + *mem = newval; \ > + } \ > __tmp != 0; \ > }) > > @@ -113,7 +152,8 @@ > ({ \ > __typeof (*(mem)) __tmp; \ > __typeof (mem) __memp = (mem); \ > - __asm __volatile ( \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > "1: ldarx %0,0,%1" MUTEX_HINT_ACQ "\n" \ > " cmpd %0,%2\n" \ > " bne 2f\n" \ > @@ -123,6 +163,12 @@ > : "=&r" (__tmp) \ > : "b" (__memp), "r" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = *__memp; \ > + if (__tmp == oldval) \ > + *__memp = newval; \ > + } \ > __tmp; \ > }) > > @@ -130,7 +176,8 @@ > ({ \ > __typeof (*(mem)) __tmp; \ > __typeof (mem) __memp = (mem); \ > - __asm __volatile (__ARCH_REL_INSTR "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile (__ARCH_REL_INSTR "\n" \ > "1: ldarx %0,0,%1" MUTEX_HINT_REL "\n" \ > " cmpd %0,%2\n" \ > " bne 2f\n" \ > @@ -140,13 +187,20 @@ > : "=&r" (__tmp) \ > : "b" (__memp), "r" (oldval), "r" (newval) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __tmp = *__memp; \ > + if (__tmp == oldval) \ > + *__memp = newval; \ > + } \ > __tmp; \ > }) > > #define __arch_atomic_exchange_64_acq(mem, value) \ > ({ \ > __typeof (*mem) __val; \ > - __asm __volatile (__ARCH_REL_INSTR "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile (__ARCH_REL_INSTR "\n" \ > "1: ldarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ > " stdcx. 
%3,0,%2\n" \ > " bne- 1b\n" \ > @@ -154,64 +208,88 @@ > : "=&r" (__val), "=m" (*mem) \ > : "b" (mem), "r" (value), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = *mem; \ > + *mem = value; \ > + } \ > __val; \ > }) > > #define __arch_atomic_exchange_64_rel(mem, value) \ > ({ \ > __typeof (*mem) __val; \ > - __asm __volatile (__ARCH_REL_INSTR "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile (__ARCH_REL_INSTR "\n" \ > "1: ldarx %0,0,%2" MUTEX_HINT_REL "\n" \ > " stdcx. %3,0,%2\n" \ > " bne- 1b" \ > : "=&r" (__val), "=m" (*mem) \ > : "b" (mem), "r" (value), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = *mem; \ > + *mem = value; \ > + } \ > __val; \ > }) > > #define __arch_atomic_exchange_and_add_64(mem, value) \ > ({ \ > __typeof (*mem) __val, __tmp; \ > - __asm __volatile ("1: ldarx %0,0,%3\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ("1: ldarx %0,0,%3\n" \ > " add %1,%0,%4\n" \ > " stdcx. %1,0,%3\n" \ > " bne- 1b" \ > : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ > : "b" (mem), "r" (value), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = *mem; \ > + *mem += value; \ > + } \ > __val; \ > }) > > #define __arch_atomic_increment_val_64(mem) \ > ({ \ > __typeof (*(mem)) __val; \ > - __asm __volatile ("1: ldarx %0,0,%2\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ("1: ldarx %0,0,%2\n" \ > " addi %0,%0,1\n" \ > " stdcx. %0,0,%2\n" \ > " bne- 1b" \ > : "=&b" (__val), "=m" (*mem) \ > : "b" (mem), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + __val = ++(*mem); \ > __val; \ > }) > > #define __arch_atomic_decrement_val_64(mem) \ > ({ \ > __typeof (*(mem)) __val; \ > - __asm __volatile ("1: ldarx %0,0,%2\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ("1: ldarx %0,0,%2\n" \ > " subi %0,%0,1\n" \ > " stdcx. 
%0,0,%2\n" \ > " bne- 1b" \ > : "=&b" (__val), "=m" (*mem) \ > : "b" (mem), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + __val = --(*mem); \ > __val; \ > }) > > #define __arch_atomic_decrement_if_positive_64(mem) \ > ({ int __val, __tmp; \ > - __asm __volatile ("1: ldarx %0,0,%3\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ("1: ldarx %0,0,%3\n" \ > " cmpdi 0,%0,0\n" \ > " addi %1,%0,-1\n" \ > " ble 2f\n" \ > @@ -221,6 +299,12 @@ > : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ > : "b" (mem), "m" (*mem) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = (*mem); \ > + if (__val > 0) \ > + --(*mem); \ > + } \ > __val; \ > }) >
Heretofore sysdeps/CPU/bits/atomic.h is for pure CPU-based implementations. In a few cases there exists a sysdeps/unix/sysv/linux/CPU/bits/atomic.h as well because it needs to use kernel support. This is something somewhere in between: you are not depending directly on specific facilities outside the pure CPU facilities; but you are depending on library infrastructure and associated assumptions that do not hold in the general case of using the atomic macros in arbitrary contexts. Furthermore, you are defining SINGLE_THREAD_P to depend on NPTL implementation details. IMHO neither of these things belong in a sysdeps/CPU/bits/atomic.h file. The lowlevellock.h change doesn't have those issues, so I'd suggest you send that separately and it should go in easily. Thanks, Roland
On 28-04-2014 18:49, Roland McGrath wrote: > Heretofore sysdeps/CPU/bits/atomic.h is for pure CPU-based implementations. > In a few cases there exists a sysdeps/unix/sysv/linux/CPU/bits/atomic.h as > well because it needs to use kernel support. > > This is something somewhere in between: you are not depending directly on > specific facilities outside the pure CPU facilities; but you are depending > on library infrastructure and associated assumptions that do not hold in > the general case of using the atomic macros in arbitrary contexts. > Furthermore, you are defining SINGLE_THREAD_P to depend on NPTL > implementation details. IMHO neither of these things belong in a > sysdeps/CPU/bits/atomic.h file. > > The lowlevellock.h change doesn't have those issues, so I'd suggest you > send that separately and it should go in easily. > > > Thanks, > Roland > I tend to agree with you, however the x86_64 implementation (sysdeps/x86_64/bits/atomic.h) itself relies on NPTL definitions (the check using (offsetof (tcbhead_t, multiple_threads)))). And the idea of changing the atomic.h it to simplify the logic to add this optimization: instead of changing the macros in lowlevellock.h and other atomic usage, it implements the optimization on the atomic itself. I bring this about x86 because usually it is the reference implementation and sometimes puzzles me that copying the same idea in another platform raise architectural question. But I concede that the reference itself maybe had not opted for best solution in first place. So if I have understand correctly, is the optimization to check for single-thread and opt to use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and check for __libc_multiple_threads value instead? 
This could be a way; however, it is redundant in most respects: the TCB definition already contains the information required, so there is no need to keep track of it in another memory reference. Also, following the x86_64 approach, it checks the TCB header information in sysdeps/CPU/bits/atomic.h, but __libc_multiple_threads in lowlevellock.h. Which is the correct guideline for other archs?
On Mon, 2014-04-28 at 19:33 -0300, Adhemerval Zanella wrote: > On 28-04-2014 18:49, Roland McGrath wrote: > > Heretofore sysdeps/CPU/bits/atomic.h is for pure CPU-based implementations. > > In a few cases there exists a sysdeps/unix/sysv/linux/CPU/bits/atomic.h as > > well because it needs to use kernel support. > > > > This is something somewhere in between: you are not depending directly on > > specific facilities outside the pure CPU facilities; but you are depending > > on library infrastructure and associated assumptions that do not hold in > > the general case of using the atomic macros in arbitrary contexts. > > Furthermore, you are defining SINGLE_THREAD_P to depend on NPTL > > implementation details. IMHO neither of these things belong in a > > sysdeps/CPU/bits/atomic.h file. > > > > The lowlevellock.h change doesn't have those issues, so I'd suggest you > > send that separately and it should go in easily. > > > > > > Thanks, > > Roland > > > I tend to agree with you, however the x86_64 implementation (sysdeps/x86_64/bits/atomic.h) > itself relies on NPTL definitions (the check using (offsetof (tcbhead_t, multiple_threads)))). > And the idea of changing the atomic.h it to simplify the logic to add this optimization: > instead of changing the macros in lowlevellock.h and other atomic usage, it implements > the optimization on the atomic itself. I agree with Roland that atomic.h shouldn't have the optimization; to me, the strongest reason is that we might need atomics that actually synchronize independently of whether we have spawned a thread or used cancellation. Also, having this optimization in the atomics will make it harder to move to, say, C11 atomics; we'd have to keep the wrappers. > I bring this about x86 because usually it is the reference implementation and sometimes puzzles > me that copying the same idea in another platform raise architectural question. 
But I concede > that the reference itself maybe had not opted for best solution in first place. > > So if I have understand correctly, is the optimization to check for single-thread and opt to > use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to > mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and > check for __libc_multiple_threads value instead? This could be a way, however it is redundant > in mostly way: the TCP definition already contains the information required, so there it no > need to keep track of it in another memory reference. Also, following x86_64 idea, it check > for TCB header information for sysdeps/CPU/bits/atomic.h, but for __libc_multiple_threads > in lowlevellock.h. Which is correct guideline for other archs? >From a synchronization perspective, I think any single-thread optimizations belong into the specific concurrent algorithms (e.g., mutexes, condvars, ...) * Doing the optimization at the lowest level (ie, the atomics) might be insufficient because if there's indeed just one thread, then lots of synchronization code can be a lot more simpler than just avoiding atomics (e.g., avoiding loops, checks, ...). * The mutexes, condvars, etc. are what's exposed to the user, so the assumptions of whether there really no concurrency or not just make sense there. For example, a single-thread program can still have a process-shared condvar, so the condvar would need to use synchronization. * We currently don't have other intra-process sources of concurrency than NPTL threads, but if we get another source (e.g., due to trying to support accelerators), we'd likely need low-level synchronization to communicate with the other thing -- and this would be independent of whether we have threads. I'm wondering whether we should move the x86 atomics special case into the concurrent algorithms. 
Otherwise, C implementations of synchronization operations that use atomics for cross-process synchronization won't work, I guess. We currently don't have those on x86, I believe (use of just lowlevellock should be fine), but I suppose things won't stay this way.
On 29-04-2014 13:22, Torvald Riegel wrote: > On Mon, 2014-04-28 at 19:33 -0300, Adhemerval Zanella wrote: >> On 28-04-2014 18:49, Roland McGrath wrote: >>> Heretofore sysdeps/CPU/bits/atomic.h is for pure CPU-based implementations. >>> In a few cases there exists a sysdeps/unix/sysv/linux/CPU/bits/atomic.h as >>> well because it needs to use kernel support. >>> >>> This is something somewhere in between: you are not depending directly on >>> specific facilities outside the pure CPU facilities; but you are depending >>> on library infrastructure and associated assumptions that do not hold in >>> the general case of using the atomic macros in arbitrary contexts. >>> Furthermore, you are defining SINGLE_THREAD_P to depend on NPTL >>> implementation details. IMHO neither of these things belong in a >>> sysdeps/CPU/bits/atomic.h file. >>> >>> The lowlevellock.h change doesn't have those issues, so I'd suggest you >>> send that separately and it should go in easily. >>> >>> >>> Thanks, >>> Roland >>> >> I tend to agree with you, however the x86_64 implementation (sysdeps/x86_64/bits/atomic.h) >> itself relies on NPTL definitions (the check using (offsetof (tcbhead_t, multiple_threads)))). >> And the idea of changing the atomic.h it to simplify the logic to add this optimization: >> instead of changing the macros in lowlevellock.h and other atomic usage, it implements >> the optimization on the atomic itself. > I agree with Roland that atomic.h shouldn't have the optimization; to > me, the strongest reason is that we might need atomics that actually > synchronize independently of whether we have spawned a thread or used > cancellation. Also, having this optimization in the atomics will make > it harder to move to, say, C11 atomics; we'd have to keep the wrappers. It does not solve the case for internal usage of atomic operations (for instance, atomic_compare_and_exchange_val_acq in malloc code). 
The idea of this patch is to mimic x86 optimization, not to refactor atomic case. So the general idea I got so far is: this should not be in atomic primitives, but x8_64 does it anyway. > >> I bring this about x86 because usually it is the reference implementation and sometimes puzzles >> me that copying the same idea in another platform raise architectural question. But I concede >> that the reference itself maybe had not opted for best solution in first place. >> >> So if I have understand correctly, is the optimization to check for single-thread and opt to >> use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to >> mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and >> check for __libc_multiple_threads value instead? This could be a way, however it is redundant >> in mostly way: the TCP definition already contains the information required, so there it no >> need to keep track of it in another memory reference. Also, following x86_64 idea, it check >> for TCB header information for sysdeps/CPU/bits/atomic.h, but for __libc_multiple_threads >> in lowlevellock.h. Which is correct guideline for other archs? > >From a synchronization perspective, I think any single-thread > optimizations belong into the specific concurrent algorithms (e.g., > mutexes, condvars, ...) > * Doing the optimization at the lowest level (ie, the atomics) might be > insufficient because if there's indeed just one thread, then lots of > synchronization code can be a lot more simpler than just avoiding > atomics (e.g., avoiding loops, checks, ...). > * The mutexes, condvars, etc. are what's exposed to the user, so the > assumptions of whether there really no concurrency or not just make > sense there. For example, a single-thread program can still have a > process-shared condvar, so the condvar would need to use > synchronization. 
Follow x86_64 idea, this optimization is only for internal atomic usage for libc itself: for a process-shared condvar, one will use pthread code, which is *not* built with this optimization. > * We currently don't have other intra-process sources of concurrency > than NPTL threads, but if we get another source (e.g., due to trying to > support accelerators), we'd likely need low-level synchronization to > communicate with the other thing -- and this would be independent of > whether we have threads. This optimization does not apply to pthread code. > I'm wondering whether we should move the x86 atomics special case into > the concurrent algorithms. Otherwise, C implementations of > synchronization operations that using atomics for cross-process > synchronization won't work I guess. We currently don't have those on > x86 I believe (use of just lowlevellock should be fine), but I suppose > things won't stay this way. >
On Tue, 2014-04-29 at 13:49 -0300, Adhemerval Zanella wrote: > On 29-04-2014 13:22, Torvald Riegel wrote: > > On Mon, 2014-04-28 at 19:33 -0300, Adhemerval Zanella wrote: > >> On 28-04-2014 18:49, Roland McGrath wrote: > >>> Heretofore sysdeps/CPU/bits/atomic.h is for pure CPU-based implementations. > >>> In a few cases there exists a sysdeps/unix/sysv/linux/CPU/bits/atomic.h as > >>> well because it needs to use kernel support. > >>> > >>> This is something somewhere in between: you are not depending directly on > >>> specific facilities outside the pure CPU facilities; but you are depending > >>> on library infrastructure and associated assumptions that do not hold in > >>> the general case of using the atomic macros in arbitrary contexts. > >>> Furthermore, you are defining SINGLE_THREAD_P to depend on NPTL > >>> implementation details. IMHO neither of these things belong in a > >>> sysdeps/CPU/bits/atomic.h file. > >>> > >>> The lowlevellock.h change doesn't have those issues, so I'd suggest you > >>> send that separately and it should go in easily. > >>> > >>> > >>> Thanks, > >>> Roland > >>> > >> I tend to agree with you, however the x86_64 implementation (sysdeps/x86_64/bits/atomic.h) > >> itself relies on NPTL definitions (the check using (offsetof (tcbhead_t, multiple_threads)))). > >> And the idea of changing the atomic.h it to simplify the logic to add this optimization: > >> instead of changing the macros in lowlevellock.h and other atomic usage, it implements > >> the optimization on the atomic itself. > > I agree with Roland that atomic.h shouldn't have the optimization; to > > me, the strongest reason is that we might need atomics that actually > > synchronize independently of whether we have spawned a thread or used > > cancellation. Also, having this optimization in the atomics will make > > it harder to move to, say, C11 atomics; we'd have to keep the wrappers. 
> > It does not solve the case for internal usage of atomic operations (for instance, > atomic_compare_and_exchange_val_acq in malloc code). In that case, this optimization should be in the malloc code. > The idea of this patch is to > mimic x86 optimization, not to refactor atomic case. So the general idea I got so > far is: this should not be in atomic primitives, but x8_64 does it anyway. I don't expect you to clean that up, it's just that I think the way x86 does it is not quite right, so duplicating this wouldn't be the right thing to do. > > > >> I bring this about x86 because usually it is the reference implementation and sometimes puzzles > >> me that copying the same idea in another platform raise architectural question. But I concede > >> that the reference itself maybe had not opted for best solution in first place. > >> > >> So if I have understand correctly, is the optimization to check for single-thread and opt to > >> use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to > >> mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and > >> check for __libc_multiple_threads value instead? This could be a way, however it is redundant > >> in mostly way: the TCP definition already contains the information required, so there it no > >> need to keep track of it in another memory reference. Also, following x86_64 idea, it check > >> for TCB header information for sysdeps/CPU/bits/atomic.h, but for __libc_multiple_threads > >> in lowlevellock.h. Which is correct guideline for other archs? > > >From a synchronization perspective, I think any single-thread > > optimizations belong into the specific concurrent algorithms (e.g., > > mutexes, condvars, ...) 
> > * Doing the optimization at the lowest level (ie, the atomics) might be > > insufficient because if there's indeed just one thread, then lots of > > synchronization code can be a lot more simpler than just avoiding > > atomics (e.g., avoiding loops, checks, ...). > > * The mutexes, condvars, etc. are what's exposed to the user, so the > > assumptions of whether there really no concurrency or not just make > > sense there. For example, a single-thread program can still have a > > process-shared condvar, so the condvar would need to use > > synchronization. > > Follow x86_64 idea, this optimization is only for internal atomic usage for > libc itself: for a process-shared condvar, one will use pthread code, which > is *not* built with this optimization. pthread code uses the same atomics we use for libc internally. Currently, the x86_64 condvar, for example, doesn't use the atomics -- but this is what we'd need it do to if we ever want to use unified implementations of condvars (e.g., like we did for pthread_once recently). > > > * We currently don't have other intra-process sources of concurrency > > than NPTL threads, but if we get another source (e.g., due to trying to > > support accelerators), we'd likely need low-level synchronization to > > communicate with the other thing -- and this would be independent of > > whether we have threads. > > This optimization does not apply to pthread code. If it uses atomics, it does. condvars, mutexes, rwlocks don't use atomics or barriers IIRC, but semaphores do, for example. We have a custom assembler implementation on x86 for semaphores, so the problem isn't currently present. If we don't want to keep those forever, we'll need the atomics to be useful for process-shared synchronization as well.
On 29-04-2014 14:53, Torvald Riegel wrote: > On Tue, 2014-04-29 at 13:49 -0300, Adhemerval Zanella wrote: >> On 29-04-2014 13:22, Torvald Riegel wrote: >>> On Mon, 2014-04-28 at 19:33 -0300, Adhemerval Zanella wrote: >>>> On 28-04-2014 18:49, Roland McGrath wrote: >>>>> Heretofore sysdeps/CPU/bits/atomic.h is for pure CPU-based implementations. >>>>> In a few cases there exists a sysdeps/unix/sysv/linux/CPU/bits/atomic.h as >>>>> well because it needs to use kernel support. >>>>> >>>>> This is something somewhere in between: you are not depending directly on >>>>> specific facilities outside the pure CPU facilities; but you are depending >>>>> on library infrastructure and associated assumptions that do not hold in >>>>> the general case of using the atomic macros in arbitrary contexts. >>>>> Furthermore, you are defining SINGLE_THREAD_P to depend on NPTL >>>>> implementation details. IMHO neither of these things belong in a >>>>> sysdeps/CPU/bits/atomic.h file. >>>>> >>>>> The lowlevellock.h change doesn't have those issues, so I'd suggest you >>>>> send that separately and it should go in easily. >>>>> >>>>> >>>>> Thanks, >>>>> Roland >>>>> >>>> I tend to agree with you, however the x86_64 implementation (sysdeps/x86_64/bits/atomic.h) >>>> itself relies on NPTL definitions (the check using (offsetof (tcbhead_t, multiple_threads)))). >>>> And the idea of changing the atomic.h it to simplify the logic to add this optimization: >>>> instead of changing the macros in lowlevellock.h and other atomic usage, it implements >>>> the optimization on the atomic itself. >>> I agree with Roland that atomic.h shouldn't have the optimization; to >>> me, the strongest reason is that we might need atomics that actually >>> synchronize independently of whether we have spawned a thread or used >>> cancellation. Also, having this optimization in the atomics will make >>> it harder to move to, say, C11 atomics; we'd have to keep the wrappers. 
>> It does not solve the case for internal usage of atomic operations (for instance, >> atomic_compare_and_exchange_val_acq in malloc code). > In that case, this optimization should be in the malloc code. I think is feasible to add this optimization based either on __libc_multiple_threads or in TCB header information. > >> The idea of this patch is to >> mimic x86 optimization, not to refactor atomic case. So the general idea I got so >> far is: this should not be in atomic primitives, but x8_64 does it anyway. > I don't expect you to clean that up, it's just that I think the way x86 > does it is not quite right, so duplicating this wouldn't be the right > thing to do. In that rationale I agree. > >>>> I bring this about x86 because usually it is the reference implementation and sometimes puzzles >>>> me that copying the same idea in another platform raise architectural question. But I concede >>>> that the reference itself maybe had not opted for best solution in first place. >>>> >>>> So if I have understand correctly, is the optimization to check for single-thread and opt to >>>> use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to >>>> mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and >>>> check for __libc_multiple_threads value instead? This could be a way, however it is redundant >>>> in mostly way: the TCP definition already contains the information required, so there it no >>>> need to keep track of it in another memory reference. Also, following x86_64 idea, it check >>>> for TCB header information for sysdeps/CPU/bits/atomic.h, but for __libc_multiple_threads >>>> in lowlevellock.h. Which is correct guideline for other archs? >>> >From a synchronization perspective, I think any single-thread >>> optimizations belong into the specific concurrent algorithms (e.g., >>> mutexes, condvars, ...) 
>>> * Doing the optimization at the lowest level (ie, the atomics) might be >>> insufficient because if there's indeed just one thread, then lots of >>> synchronization code can be a lot more simpler than just avoiding >>> atomics (e.g., avoiding loops, checks, ...). >>> * The mutexes, condvars, etc. are what's exposed to the user, so the >>> assumptions of whether there really no concurrency or not just make >>> sense there. For example, a single-thread program can still have a >>> process-shared condvar, so the condvar would need to use >>> synchronization. >> Follow x86_64 idea, this optimization is only for internal atomic usage for >> libc itself: for a process-shared condvar, one will use pthread code, which >> is *not* built with this optimization. > pthread code uses the same atomics we use for libc internally. > Currently, the x86_64 condvar, for example, doesn't use the atomics -- > but this is what we'd need it do to if we ever want to use unified > implementations of condvars (e.g., like we did for pthread_once > recently). If you check my patch, the SINGLE_THREAD_P is defined as: #ifndef NOT_IN_libc # define SINGLE_THREAD_P \ (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) #else # define SINGLE_THREAD_P 0 #endif So for libpthread, the code path to use non atomic will be eliminated. x86_64 is not that careful in some atomic primitives though. >>> * We currently don't have other intra-process sources of concurrency >>> than NPTL threads, but if we get another source (e.g., due to trying to >>> support accelerators), we'd likely need low-level synchronization to >>> communicate with the other thing -- and this would be independent of >>> whether we have threads. >> This optimization does not apply to pthread code. > If it uses atomics, it does. condvars, mutexes, rwlocks don't use > atomics or barriers IIRC, but semaphores do, for example. 
We have a > custom assembler implementation on x86 for semaphores, so the problem > isn't currently present. If we don't want to keep those forever, we'll > need the atomics to be useful for process-shared synchronization as > well. >
On Tue, 2014-04-08 at 10:26 -0300, Adhemerval Zanella wrote: > This patch adds a single-thread optimization for libc.so locks used > within the shared objects. For each lock operations it checks it the > process has already spawned one thread and if not use non-atomic > operations. Other libraries (libpthread.so for instance) are unaffected > by this change. > > This is similar to x86_64 optimization on locks and atomics by using the > __libc_multiple_threads variable. > > Tested on powerpc32, powerpc64, and powerp64le. > > Note: for macro code change I tried to change as little as possible the > current syntax. > > -- > > * nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h > (__lll_robust_trylock): Add single-thread lock optimization for calls > within libc.so. See comment below. > * sysdeps/powerpc/bits/atomic.h > (__arch_compare_and_exchange_val_32_acq): Likewise. > (__arch_compare_and_exchange_val_32_rel): Likewise. > (__arch_atomic_exchange_32_acq): Likewise. > (__arch_atomic_exchange_32_rel): Likewise. > (__arch_atomic_exchange_and_add_32): Likewise. > (__arch_atomic_increment_val_32): Likewise. > (__arch_atomic_decrement_val_32): Likewise. > (__arch_atomic_decrement_if_positive_32): Likewise. > * sysdeps/powerpc/powerpc32/bits/atomic.h > (__arch_compare_and_exchange_bool_32_acq): Likewise. > (__arch_compare_and_exchange_bool_32_rel): Likewise. > * sysdeps/powerpc/powerpc64/bits/atomic.h > (__arch_compare_and_exchange_bool_32_acq): Likewise. > (__arch_compare_and_exchange_bool_32_rel): Likewise. > (__arch_compare_and_exchange_bool_64_acq): Likewise. > (__arch_compare_and_exchange_bool_64_rel): Likewise. > (__arch_compare_and_exchange_val_64_acq): Likewise. > (__arch_compare_and_exchange_val_64_rel): Likewise. > (__arch_atomic_exchange_64_acq): Likewise. > (__arch_atomic_exchange_64_rel): Likewise. > (__arch_atomic_exchange_and_add_64): Likewise. > (__arch_atomic_increment_val_64): Likewise. > (__arch_atomic_decrement_val_64): Likewise. 
> (__arch_atomic_decrement_if_positive_64): Likewise. I think the optimization you attempt here does not belong into the low-level atomics, as I've argued elsewhere in the thread. > --- > > diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h > index ab92c3f..419ee2f 100644 > --- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h > +++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h > @@ -205,7 +205,9 @@ > /* Set *futex to ID if it is 0, atomically. Returns the old value */ > #define __lll_robust_trylock(futex, id) \ > ({ int __val; \ > - __asm __volatile ("1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ > + if (!SINGLE_THREAD_P) \ > + __asm __volatile ( \ > + "1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ > " cmpwi 0,%0,0\n" \ > " bne 2f\n" \ > " stwcx. %3,0,%2\n" \ > @@ -214,6 +216,12 @@ > : "=&r" (__val), "=m" (*futex) \ > : "r" (futex), "r" (id), "m" (*futex) \ > : "cr0", "memory"); \ > + else \ > + { \ > + __val = *futex; \ > + if (__val == 0) \ > + *futex = id; \ > + } \ > __val; \ > }) > The single client of this is in pthread_mutex_trylock.c. If there's actually a need to do this, it belongs here, not in each arch's lowlevellock.h.
Hi Torvald, thanks for the review. I have dropped this patch and sent another one that focus optimization only in lowlevellock.h instead of changing the atomic.h: https://sourceware.org/ml/libc-alpha/2014-04/msg00671.html https://sourceware.org/ml/libc-alpha/2014-04/msg00672.html On 02-05-2014 10:11, Torvald Riegel wrote: > On Tue, 2014-04-08 at 10:26 -0300, Adhemerval Zanella wrote: >> This patch adds a single-thread optimization for libc.so locks used >> within the shared objects. For each lock operations it checks it the >> process has already spawned one thread and if not use non-atomic >> operations. Other libraries (libpthread.so for instance) are unaffected >> by this change. >> >> This is similar to x86_64 optimization on locks and atomics by using the >> __libc_multiple_threads variable. >> >> Tested on powerpc32, powerpc64, and powerp64le. >> >> Note: for macro code change I tried to change as little as possible the >> current syntax. >> >> -- >> >> * nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h >> (__lll_robust_trylock): Add single-thread lock optimization for calls >> within libc.so. > See comment below. > >> * sysdeps/powerpc/bits/atomic.h >> (__arch_compare_and_exchange_val_32_acq): Likewise. >> (__arch_compare_and_exchange_val_32_rel): Likewise. >> (__arch_atomic_exchange_32_acq): Likewise. >> (__arch_atomic_exchange_32_rel): Likewise. >> (__arch_atomic_exchange_and_add_32): Likewise. >> (__arch_atomic_increment_val_32): Likewise. >> (__arch_atomic_decrement_val_32): Likewise. >> (__arch_atomic_decrement_if_positive_32): Likewise. >> * sysdeps/powerpc/powerpc32/bits/atomic.h >> (__arch_compare_and_exchange_bool_32_acq): Likewise. >> (__arch_compare_and_exchange_bool_32_rel): Likewise. >> * sysdeps/powerpc/powerpc64/bits/atomic.h >> (__arch_compare_and_exchange_bool_32_acq): Likewise. >> (__arch_compare_and_exchange_bool_32_rel): Likewise. >> (__arch_compare_and_exchange_bool_64_acq): Likewise. 
>> (__arch_compare_and_exchange_bool_64_rel): Likewise. >> (__arch_compare_and_exchange_val_64_acq): Likewise. >> (__arch_compare_and_exchange_val_64_rel): Likewise. >> (__arch_atomic_exchange_64_acq): Likewise. >> (__arch_atomic_exchange_64_rel): Likewise. >> (__arch_atomic_exchange_and_add_64): Likewise. >> (__arch_atomic_increment_val_64): Likewise. >> (__arch_atomic_decrement_val_64): Likewise. >> (__arch_atomic_decrement_if_positive_64): Likewise. > I think the optimization you attempt here does not belong into the > low-level atomics, as I've argued elsewhere in the thread. > >> --- >> >> diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h >> index ab92c3f..419ee2f 100644 >> --- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h >> +++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h >> @@ -205,7 +205,9 @@ >> /* Set *futex to ID if it is 0, atomically. Returns the old value */ >> #define __lll_robust_trylock(futex, id) \ >> ({ int __val; \ >> - __asm __volatile ("1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ >> + if (!SINGLE_THREAD_P) \ >> + __asm __volatile ( \ >> + "1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ >> " cmpwi 0,%0,0\n" \ >> " bne 2f\n" \ >> " stwcx. %3,0,%2\n" \ >> @@ -214,6 +216,12 @@ >> : "=&r" (__val), "=m" (*futex) \ >> : "r" (futex), "r" (id), "m" (*futex) \ >> : "cr0", "memory"); \ >> + else \ >> + { \ >> + __val = *futex; \ >> + if (__val == 0) \ >> + *futex = id; \ >> + } \ >> __val; \ >> }) >> > The single client of this is in pthread_mutex_trylock.c. If there's > actually a need to do this, it belongs here, not in each arch's > lowlevellock.h. > >
On Tue, 2014-04-29 at 15:05 -0300, Adhemerval Zanella wrote: > On 29-04-2014 14:53, Torvald Riegel wrote: > > On Tue, 2014-04-29 at 13:49 -0300, Adhemerval Zanella wrote: > >> On 29-04-2014 13:22, Torvald Riegel wrote: > >>> On Mon, 2014-04-28 at 19:33 -0300, Adhemerval Zanella wrote: > >>>> I bring this about x86 because usually it is the reference implementation and sometimes puzzles > >>>> me that copying the same idea in another platform raise architectural question. But I concede > >>>> that the reference itself maybe had not opted for best solution in first place. > >>>> > >>>> So if I have understand correctly, is the optimization to check for single-thread and opt to > >>>> use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to > >>>> mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and > >>>> check for __libc_multiple_threads value instead? This could be a way, however it is redundant > >>>> in mostly way: the TCP definition already contains the information required, so there it no > >>>> need to keep track of it in another memory reference. Also, following x86_64 idea, it check > >>>> for TCB header information for sysdeps/CPU/bits/atomic.h, but for __libc_multiple_threads > >>>> in lowlevellock.h. Which is correct guideline for other archs? > >>> >From a synchronization perspective, I think any single-thread > >>> optimizations belong into the specific concurrent algorithms (e.g., > >>> mutexes, condvars, ...) > >>> * Doing the optimization at the lowest level (ie, the atomics) might be > >>> insufficient because if there's indeed just one thread, then lots of > >>> synchronization code can be a lot more simpler than just avoiding > >>> atomics (e.g., avoiding loops, checks, ...). > >>> * The mutexes, condvars, etc. are what's exposed to the user, so the > >>> assumptions of whether there really no concurrency or not just make > >>> sense there. 
For example, a single-thread program can still have a > >>> process-shared condvar, so the condvar would need to use > >>> synchronization. > >> Follow x86_64 idea, this optimization is only for internal atomic usage for > >> libc itself: for a process-shared condvar, one will use pthread code, which > >> is *not* built with this optimization. > > pthread code uses the same atomics we use for libc internally. > > Currently, the x86_64 condvar, for example, doesn't use the atomics -- > > but this is what we'd need it do to if we ever want to use unified > > implementations of condvars (e.g., like we did for pthread_once > > recently). > > If you check my patch, the SINGLE_THREAD_P is defined as: > > #ifndef NOT_IN_libc > # define SINGLE_THREAD_P \ > (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) > #else > # define SINGLE_THREAD_P 0 > #endif > > So for libpthread, the code path to use non atomic will be eliminated. x86_64 is > not that careful in some atomic primitives though. I think that's not sufficient, nor are the low-level atomics the right place for this kind of optimization. First, there are several source of concurrency affecting shared-memory synchronization: * Threads created by nptl. * Other processes we're interacting with via shared memory. * Reentrancy. * The kernel, if we should synchronize with it via shared memory (e.g., recent perf does so, IIRC). We control the first. The second case is, I suppose, only reachable by using pthreads pshared sync operations (or not?). In case of reentrancy, there is concurrency between a signal handler and a process consisting of a single thread, so we might want to use atomics to synchronize. I haven't checked whether we actually do (Alex might know after doing the MT-Safety documentation) -- but I would not want us preventing from using atomics for that, so a check on just multiple_threads is not sufficient IMO. Something similar applies to the kernel case. 
Or if, in the future, we should want to sync with any accelerators or similar. Therefore, I think we need to have atomics that always synchronize, even if we've just got one thread so far. If we want to do the optimization you want to do, I think we need the user to clarify which sources of concurrency cannot be present in the respective piece of code. This also drives how we can weaken the atomics: * If there's only reentrancy as source of concurrency, we still need read-modify-write ops, atomic writes, etc., but can skip all HW barriers. (Assuming that signal handler execution actually establishes a happens-before with the interrupted thread.) * If there's no source of concurrency at all, we can just write sequential code. Those weaker versions should, IMO, be available as wrappers or separate functions, so that the full atomics are always available. Something like catomic_add and similar on x86_64, except that we'd might want to have a more descriptive name, and distinguish between the no-concurrency-at-all case and the no-concurrency-except-reentrancy cases (the latter being what catomic_add falls into). The next issue I see is whether we'd actually want the sequential code (ie, for no-concurrency-at-all) to be provided in the form of a variant of atomics or directly in the code using it. Based on a quick grep for the atomics, I see about 20 files that use atomics (ignoring nptl). Quite a few of those seem to use atomics in concurrent code that goes beyond a single increment of a counter or such, so they might benefit performance-wise from having an actually sequential version of the code, not just "sequential atomics". The amount lines of code defining atomics is significantly larger than the uses. This indicates that it might be better to put those optimizations into the code that uses atomics. 
The code would have to be reviewed anyway to see which sources of concurrency it faces, and would probably have to be changed to add the selection of the appropriate atomics for that (if we should add the optimization to the atomics). Finally, any such optimization will add a branch on each use of atomics, and depending on how they are built (e.g., asm or C), the compiler might or might not be able to merge those. I'd like to see at least some indication that we're not significantly slowing down multi-threaded code to get some benefit on single-threaded code. We also might want to consider whether we'd want to put a glibc_likely on those branches, favoring multi-threaded code (in my opinion) or single-threaded code (considering the amount of single-threaded code we have today).
On 02-05-2014 11:04, Torvald Riegel wrote: > On Tue, 2014-04-29 at 15:05 -0300, Adhemerval Zanella wrote: >> On 29-04-2014 14:53, Torvald Riegel wrote: >>> On Tue, 2014-04-29 at 13:49 -0300, Adhemerval Zanella wrote: >>>> On 29-04-2014 13:22, Torvald Riegel wrote: >>>>> On Mon, 2014-04-28 at 19:33 -0300, Adhemerval Zanella wrote: >>>>>> I bring this about x86 because usually it is the reference implementation and sometimes puzzles >>>>>> me that copying the same idea in another platform raise architectural question. But I concede >>>>>> that the reference itself maybe had not opted for best solution in first place. >>>>>> >>>>>> So if I have understand correctly, is the optimization to check for single-thread and opt to >>>>>> use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to >>>>>> mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and >>>>>> check for __libc_multiple_threads value instead? This could be a way, however it is redundant >>>>>> in mostly way: the TCP definition already contains the information required, so there it no >>>>>> need to keep track of it in another memory reference. Also, following x86_64 idea, it check >>>>>> for TCB header information for sysdeps/CPU/bits/atomic.h, but for __libc_multiple_threads >>>>>> in lowlevellock.h. Which is correct guideline for other archs? >>>>> >From a synchronization perspective, I think any single-thread >>>>> optimizations belong into the specific concurrent algorithms (e.g., >>>>> mutexes, condvars, ...) >>>>> * Doing the optimization at the lowest level (ie, the atomics) might be >>>>> insufficient because if there's indeed just one thread, then lots of >>>>> synchronization code can be a lot more simpler than just avoiding >>>>> atomics (e.g., avoiding loops, checks, ...). >>>>> * The mutexes, condvars, etc. 
are what's exposed to the user, so the >>>>> assumptions of whether there really no concurrency or not just make >>>>> sense there. For example, a single-thread program can still have a >>>>> process-shared condvar, so the condvar would need to use >>>>> synchronization. >>>> Follow x86_64 idea, this optimization is only for internal atomic usage for >>>> libc itself: for a process-shared condvar, one will use pthread code, which >>>> is *not* built with this optimization. >>> pthread code uses the same atomics we use for libc internally. >>> Currently, the x86_64 condvar, for example, doesn't use the atomics -- >>> but this is what we'd need it do to if we ever want to use unified >>> implementations of condvars (e.g., like we did for pthread_once >>> recently). >> If you check my patch, the SINGLE_THREAD_P is defined as: >> >> #ifndef NOT_IN_libc >> # define SINGLE_THREAD_P \ >> (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) >> #else >> # define SINGLE_THREAD_P 0 >> #endif >> >> So for libpthread, the code path to use non atomic will be eliminated. x86_64 is >> not that careful in some atomic primitives though. > I think that's not sufficient, nor are the low-level atomics the right > place for this kind of optimization. > > First, there are several source of concurrency affecting shared-memory > synchronization: > * Threads created by nptl. > * Other processes we're interacting with via shared memory. > * Reentrancy. > * The kernel, if we should synchronize with it via shared memory (e.g., > recent perf does so, IIRC). > > We control the first. The second case is, I suppose, only reachable by > using pthreads pshared sync operations (or not?). > > In case of reentrancy, there is concurrency between a signal handler and > a process consisting of a single thread, so we might want to use atomics > to synchronize. 
I haven't checked whether we actually do (Alex might > know after doing the MT-Safety documentation) -- but I would not want us > preventing from using atomics for that, so a check on just > multiple_threads is not sufficient IMO. > Something similar applies to the kernel case. Or if, in the future, we > should want to sync with any accelerators or similar. As I stated previously, I have dropped to modify the atomic.h in favor of just the lowlevellock.h. And I think we need to reevaluate then the x86_64 code that does exactly what you think is wrong (add the single-thread opt on atomics). > > Therefore, I think we need to have atomics that always synchronize, even > if we've just got one thread so far. > > If we want to do the optimization you want to do, I think we need the > user to clarify which sources of concurrency cannot be present in the > respective piece of code. This also drives how we can weaken the > atomics: > * If there's only reentrancy as source of concurrency, we still need > read-modify-write ops, atomic writes, etc., but can skip all HW > barriers. (Assuming that signal handler execution actually establishes > a happens-before with the interrupted thread.) > * If there's no source of concurrency at all, we can just write > sequential code. > > Those weaker versions should, IMO, be available as wrappers or separate > functions, so that the full atomics are always available. Something > like catomic_add and similar on x86_64, except that we'd might want to > have a more descriptive name, and distinguish between the > no-concurrency-at-all case and the no-concurrency-except-reentrancy > cases (the latter being what catomic_add falls into). > > > The next issue I see is whether we'd actually want the sequential code > (ie, for no-concurrency-at-all) to be provided in the form of a variant > of atomics or directly in the code using it. Based on a quick grep for > the atomics, I see about 20 files that use atomics (ignoring nptl). 
> Quite a few of those seem to use atomics in concurrent code that goes > beyond a single increment of a counter or such, so they might benefit > performance-wise from having an actually sequential version of the code, > not just "sequential atomics". > The amount lines of code defining atomics is significantly larger than > the uses. > > This indicates that it might be better to put those optimizations into > the code that uses atomics. The code would have to be reviewed anyway > to see which sources of concurrency it faces, and would probably have to > be changed to add the selection of the appropriate atomics for that (if > we should add the optimization to the atomics). That's my idea of the malloc patch I have sent, instead of change atomics, change the way the atomics is being used. > > > Finally, any such optimization will add a branch on each use of atomics, > and depending on how they are built (e.g., asm or C), the compiler might > or might not be able to merge those. I'd like to see at least some > indication that we're not significantly slowing down multi-threaded code > to get some benefit on single-threaded code. > We also might want to consider whether we'd want to put a glibc_likely > on those branches, favoring multi-threaded code (in my opinion) or > single-threaded code (considering the amount of single-threaded code we > have today). >
On Fri, 2014-05-02 at 11:15 -0300, Adhemerval Zanella wrote: > On 02-05-2014 11:04, Torvald Riegel wrote: > > On Tue, 2014-04-29 at 15:05 -0300, Adhemerval Zanella wrote: > >> On 29-04-2014 14:53, Torvald Riegel wrote: > >>> On Tue, 2014-04-29 at 13:49 -0300, Adhemerval Zanella wrote: > >>>> On 29-04-2014 13:22, Torvald Riegel wrote: > >>>>> On Mon, 2014-04-28 at 19:33 -0300, Adhemerval Zanella wrote: > >>>>>> I bring this about x86 because usually it is the reference implementation and sometimes puzzles > >>>>>> me that copying the same idea in another platform raise architectural question. But I concede > >>>>>> that the reference itself maybe had not opted for best solution in first place. > >>>>>> > >>>>>> So if I have understand correctly, is the optimization to check for single-thread and opt to > >>>>>> use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to > >>>>>> mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and > >>>>>> check for __libc_multiple_threads value instead? This could be a way, however it is redundant > >>>>>> in mostly way: the TCP definition already contains the information required, so there it no > >>>>>> need to keep track of it in another memory reference. Also, following x86_64 idea, it check > >>>>>> for TCB header information for sysdeps/CPU/bits/atomic.h, but for __libc_multiple_threads > >>>>>> in lowlevellock.h. Which is correct guideline for other archs? > >>>>> >From a synchronization perspective, I think any single-thread > >>>>> optimizations belong into the specific concurrent algorithms (e.g., > >>>>> mutexes, condvars, ...) > >>>>> * Doing the optimization at the lowest level (ie, the atomics) might be > >>>>> insufficient because if there's indeed just one thread, then lots of > >>>>> synchronization code can be a lot more simpler than just avoiding > >>>>> atomics (e.g., avoiding loops, checks, ...). > >>>>> * The mutexes, condvars, etc. 
are what's exposed to the user, so the > >>>>> assumptions of whether there really no concurrency or not just make > >>>>> sense there. For example, a single-thread program can still have a > >>>>> process-shared condvar, so the condvar would need to use > >>>>> synchronization. > >>>> Follow x86_64 idea, this optimization is only for internal atomic usage for > >>>> libc itself: for a process-shared condvar, one will use pthread code, which > >>>> is *not* built with this optimization. > >>> pthread code uses the same atomics we use for libc internally. > >>> Currently, the x86_64 condvar, for example, doesn't use the atomics -- > >>> but this is what we'd need it do to if we ever want to use unified > >>> implementations of condvars (e.g., like we did for pthread_once > >>> recently). > >> If you check my patch, the SINGLE_THREAD_P is defined as: > >> > >> #ifndef NOT_IN_libc > >> # define SINGLE_THREAD_P \ > >> (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) > >> #else > >> # define SINGLE_THREAD_P 0 > >> #endif > >> > >> So for libpthread, the code path to use non atomic will be eliminated. x86_64 is > >> not that careful in some atomic primitives though. > > I think that's not sufficient, nor are the low-level atomics the right > > place for this kind of optimization. > > > > First, there are several source of concurrency affecting shared-memory > > synchronization: > > * Threads created by nptl. > > * Other processes we're interacting with via shared memory. > > * Reentrancy. > > * The kernel, if we should synchronize with it via shared memory (e.g., > > recent perf does so, IIRC). > > > > We control the first. The second case is, I suppose, only reachable by > > using pthreads pshared sync operations (or not?). > > > > In case of reentrancy, there is concurrency between a signal handler and > > a process consisting of a single thread, so we might want to use atomics > > to synchronize. 
I haven't checked whether we actually do (Alex might > > know after doing the MT-Safety documentation) -- but I would not want us > > to be prevented from using atomics for that, so a check on just > > multiple_threads is not sufficient IMO. > > Something similar applies to the kernel case. Or if, in the future, we > > should want to sync with any accelerators or similar. > > As I stated previously, I have dropped modifying the atomic.h in favor of just > the lowlevellock.h. > > And I think we need to then reevaluate the x86_64 code that does exactly what > you think is wrong (add the single-thread opt on atomics). Note that those are different: They drop the "lock" prefix, but they are not sequential code like what you add. I agree that it's worth documenting them, but those should work in the case of reentrancy.
On 02-05-2014 11:37, Torvald Riegel wrote: > On Fri, 2014-05-02 at 11:15 -0300, Adhemerval Zanella wrote: >> On 02-05-2014 11:04, Torvald Riegel wrote: >>> On Tue, 2014-04-29 at 15:05 -0300, Adhemerval Zanella wrote: >>>> On 29-04-2014 14:53, Torvald Riegel wrote: >>>>> On Tue, 2014-04-29 at 13:49 -0300, Adhemerval Zanella wrote: >>>>>> On 29-04-2014 13:22, Torvald Riegel wrote: >>>>>>> On Mon, 2014-04-28 at 19:33 -0300, Adhemerval Zanella wrote: >>>>>>>> I bring this about x86 because usually it is the reference implementation and sometimes puzzles >>>>>>>> me that copying the same idea in another platform raise architectural question. But I concede >>>>>>>> that the reference itself maybe had not opted for best solution in first place. >>>>>>>> >>>>>>>> So if I have understand correctly, is the optimization to check for single-thread and opt to >>>>>>>> use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to >>>>>>>> mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and >>>>>>>> check for __libc_multiple_threads value instead? This could be a way, however it is redundant >>>>>>>> in mostly way: the TCP definition already contains the information required, so there it no >>>>>>>> need to keep track of it in another memory reference. Also, following x86_64 idea, it check >>>>>>>> for TCB header information for sysdeps/CPU/bits/atomic.h, but for __libc_multiple_threads >>>>>>>> in lowlevellock.h. Which is correct guideline for other archs? >>>>>>> >From a synchronization perspective, I think any single-thread >>>>>>> optimizations belong into the specific concurrent algorithms (e.g., >>>>>>> mutexes, condvars, ...) >>>>>>> * Doing the optimization at the lowest level (ie, the atomics) might be >>>>>>> insufficient because if there's indeed just one thread, then lots of >>>>>>> synchronization code can be a lot more simpler than just avoiding >>>>>>> atomics (e.g., avoiding loops, checks, ...). 
>>>>>>> * The mutexes, condvars, etc. are what's exposed to the user, so the >>>>>>> assumptions of whether there really no concurrency or not just make >>>>>>> sense there. For example, a single-thread program can still have a >>>>>>> process-shared condvar, so the condvar would need to use >>>>>>> synchronization. >>>>>> Follow x86_64 idea, this optimization is only for internal atomic usage for >>>>>> libc itself: for a process-shared condvar, one will use pthread code, which >>>>>> is *not* built with this optimization. >>>>> pthread code uses the same atomics we use for libc internally. >>>>> Currently, the x86_64 condvar, for example, doesn't use the atomics -- >>>>> but this is what we'd need it do to if we ever want to use unified >>>>> implementations of condvars (e.g., like we did for pthread_once >>>>> recently). >>>> If you check my patch, the SINGLE_THREAD_P is defined as: >>>> >>>> #ifndef NOT_IN_libc >>>> # define SINGLE_THREAD_P \ >>>> (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) >>>> #else >>>> # define SINGLE_THREAD_P 0 >>>> #endif >>>> >>>> So for libpthread, the code path to use non atomic will be eliminated. x86_64 is >>>> not that careful in some atomic primitives though. >>> I think that's not sufficient, nor are the low-level atomics the right >>> place for this kind of optimization. >>> >>> First, there are several source of concurrency affecting shared-memory >>> synchronization: >>> * Threads created by nptl. >>> * Other processes we're interacting with via shared memory. >>> * Reentrancy. >>> * The kernel, if we should synchronize with it via shared memory (e.g., >>> recent perf does so, IIRC). >>> >>> We control the first. The second case is, I suppose, only reachable by >>> using pthreads pshared sync operations (or not?). >>> >>> In case of reentrancy, there is concurrency between a signal handler and >>> a process consisting of a single thread, so we might want to use atomics >>> to synchronize. 
I haven't checked whether we actually do (Alex might >>> know after doing the MT-Safety documentation) -- but I would not want us >>> preventing from using atomics for that, so a check on just >>> multiple_threads is not sufficient IMO. >>> Something similar applies to the kernel case. Or if, in the future, we >>> should want to sync with any accelerators or similar. >> As I stated previously, I have dropped to modify the atomic.h in favor of just >> the lowlevellock.h. >> >> And I think we need to reevaluate then the x86_64 code that does exactly what >> you think is wrong (add the single-thread opt on atomics). > Note that those are different: They drop the "lock" prefix, but they are > not sequential code like what you add. > I agree that it's worth documenting them, but those should work in case > of reentrancy. > In fact I am arguing about the x86_64 atomic.h implementation because from this discussion I understood the good practice for an architecture implementation is not to add such optimization on atomic itself, but rather than in the atomics usage (such as lowlevellock.h). And correct if I'm wrong, but I think it would be concurrency in reentrancy only if pthread_create (which is the function that will change tcb::multiple_threads or __libc_multiple_threads) would be called inside the signal handler. However pthread_create (and majority of pthread functions) are not signal-safe.
On Fri, 2014-05-02 at 12:01 -0300, Adhemerval Zanella wrote: > On 02-05-2014 11:37, Torvald Riegel wrote: > > On Fri, 2014-05-02 at 11:15 -0300, Adhemerval Zanella wrote: > >> On 02-05-2014 11:04, Torvald Riegel wrote: > >>> On Tue, 2014-04-29 at 15:05 -0300, Adhemerval Zanella wrote: > >>>> On 29-04-2014 14:53, Torvald Riegel wrote: > >>>>> On Tue, 2014-04-29 at 13:49 -0300, Adhemerval Zanella wrote: > >>>>>> On 29-04-2014 13:22, Torvald Riegel wrote: > >>>>>>> On Mon, 2014-04-28 at 19:33 -0300, Adhemerval Zanella wrote: > >>>>>>>> I bring this about x86 because usually it is the reference implementation and sometimes puzzles > >>>>>>>> me that copying the same idea in another platform raise architectural question. But I concede > >>>>>>>> that the reference itself maybe had not opted for best solution in first place. > >>>>>>>> > >>>>>>>> So if I have understand correctly, is the optimization to check for single-thread and opt to > >>>>>>>> use locks is to focused on lowlevellock solely? If so, how do you suggest to other archs to > >>>>>>>> mimic x86 optimization on atomic.h primitives? Should other arch follow the x86_64 and > >>>>>>>> check for __libc_multiple_threads value instead? This could be a way, however it is redundant > >>>>>>>> in mostly way: the TCP definition already contains the information required, so there it no > >>>>>>>> need to keep track of it in another memory reference. Also, following x86_64 idea, it check > >>>>>>>> for TCB header information for sysdeps/CPU/bits/atomic.h, but for __libc_multiple_threads > >>>>>>>> in lowlevellock.h. Which is correct guideline for other archs? > >>>>>>> >From a synchronization perspective, I think any single-thread > >>>>>>> optimizations belong into the specific concurrent algorithms (e.g., > >>>>>>> mutexes, condvars, ...) 
> >>>>>>> * Doing the optimization at the lowest level (ie, the atomics) might be > >>>>>>> insufficient because if there's indeed just one thread, then lots of > >>>>>>> synchronization code can be a lot more simpler than just avoiding > >>>>>>> atomics (e.g., avoiding loops, checks, ...). > >>>>>>> * The mutexes, condvars, etc. are what's exposed to the user, so the > >>>>>>> assumptions of whether there really no concurrency or not just make > >>>>>>> sense there. For example, a single-thread program can still have a > >>>>>>> process-shared condvar, so the condvar would need to use > >>>>>>> synchronization. > >>>>>> Follow x86_64 idea, this optimization is only for internal atomic usage for > >>>>>> libc itself: for a process-shared condvar, one will use pthread code, which > >>>>>> is *not* built with this optimization. > >>>>> pthread code uses the same atomics we use for libc internally. > >>>>> Currently, the x86_64 condvar, for example, doesn't use the atomics -- > >>>>> but this is what we'd need it do to if we ever want to use unified > >>>>> implementations of condvars (e.g., like we did for pthread_once > >>>>> recently). > >>>> If you check my patch, the SINGLE_THREAD_P is defined as: > >>>> > >>>> #ifndef NOT_IN_libc > >>>> # define SINGLE_THREAD_P \ > >>>> (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) > >>>> #else > >>>> # define SINGLE_THREAD_P 0 > >>>> #endif > >>>> > >>>> So for libpthread, the code path to use non atomic will be eliminated. x86_64 is > >>>> not that careful in some atomic primitives though. > >>> I think that's not sufficient, nor are the low-level atomics the right > >>> place for this kind of optimization. > >>> > >>> First, there are several source of concurrency affecting shared-memory > >>> synchronization: > >>> * Threads created by nptl. > >>> * Other processes we're interacting with via shared memory. > >>> * Reentrancy. 
> >>> * The kernel, if we should synchronize with it via shared memory (e.g., > >>> recent perf does so, IIRC). > >>> > >>> We control the first. The second case is, I suppose, only reachable by > >>> using pthreads pshared sync operations (or not?). > >>> > >>> In case of reentrancy, there is concurrency between a signal handler and > >>> a process consisting of a single thread, so we might want to use atomics > >>> to synchronize. I haven't checked whether we actually do (Alex might > >>> know after doing the MT-Safety documentation) -- but I would not want us > >>> preventing from using atomics for that, so a check on just > >>> multiple_threads is not sufficient IMO. > >>> Something similar applies to the kernel case. Or if, in the future, we > >>> should want to sync with any accelerators or similar. > >> As I stated previously, I have dropped to modify the atomic.h in favor of just > >> the lowlevellock.h. > >> > >> And I think we need to reevaluate then the x86_64 code that does exactly what > >> you think is wrong (add the single-thread opt on atomics). > > Note that those are different: They drop the "lock" prefix, but they are > > not sequential code like what you add. > > I agree that it's worth documenting them, but those should work in case > > of reentrancy. > > > In fact I am arguing about the x86_64 atomic.h implementation because from this > discussion I understood the good practice for an architecture implementation is > not to add such optimization on atomic itself, but rather than in the atomics > usage (such as lowlevellock.h). That I agree on, partially. The catomic* functions do need different atomics, because they just drop the "lock" prefix. AFAIK x86, this makes them indivisible in terms of execution (so safe wrt. interruption by signal handlers), but not atomic in presence of other concurrency. This would be similar to keeping LL-SC but removing all memory barriers. 
Thus, if a client would like to use atomics just for being safe in the face of reentrancy, then those atomics would be required to achieve this. > And correct me if I'm wrong, but I think it would be concurrency in reentrancy only > if pthread_create (which is the function that will change tcb::multiple_threads > or __libc_multiple_threads) would be called inside the signal handler. The fact that pthread_create changes tcb::multiple_threads is something that should be documented as making pthread_create AS-Unsafe (amongst other reasons). Concurrency due to reentrancy can happen without additional threads. If X() is supposed to be AS-Safe, and we have X() being interrupted by a signal handler that also executes X(), then we have one execution of X concurrent with another one. > However > pthread_create (and the majority of pthread functions) are not signal-safe. BTW, those are currently not documented. Is that still on someone's todo list?
diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h index ab92c3f..419ee2f 100644 --- a/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h +++ b/nptl/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h @@ -205,7 +205,9 @@ /* Set *futex to ID if it is 0, atomically. Returns the old value */ #define __lll_robust_trylock(futex, id) \ ({ int __val; \ - __asm __volatile ("1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ + "1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ " cmpwi 0,%0,0\n" \ " bne 2f\n" \ " stwcx. %3,0,%2\n" \ @@ -214,6 +216,12 @@ : "=&r" (__val), "=m" (*futex) \ : "r" (futex), "r" (id), "m" (*futex) \ : "cr0", "memory"); \ + else \ + { \ + __val = *futex; \ + if (__val == 0) \ + *futex = id; \ + } \ __val; \ }) diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h index 2ffba48..2d31411 100644 --- a/sysdeps/powerpc/bits/atomic.h +++ b/sysdeps/powerpc/bits/atomic.h @@ -76,6 +76,10 @@ typedef uintmax_t uatomic_max_t; # define MUTEX_HINT_REL #endif +/* Note: SINGLE_THREAD_P is defined either in + sysdeps/powerpc/powerpc64/bits/atomic.h or + sysdeps/powerpc/powerpc32/bits/atomic.h */ + #define atomic_full_barrier() __asm ("sync" ::: "memory") #define atomic_write_barrier() __asm ("eieio" ::: "memory") @@ -83,7 +87,8 @@ typedef uintmax_t uatomic_max_t; ({ \ __typeof (*(mem)) __tmp; \ __typeof (mem) __memp = (mem); \ - __asm __volatile ( \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ "1: lwarx %0,0,%1" MUTEX_HINT_ACQ "\n" \ " cmpw %0,%2\n" \ " bne 2f\n" \ @@ -93,6 +98,12 @@ typedef uintmax_t uatomic_max_t; : "=&r" (__tmp) \ : "b" (__memp), "r" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = *__memp; \ + if (__tmp == oldval) \ + *__memp = newval; \ + } \ __tmp; \ }) @@ -100,7 +111,8 @@ typedef uintmax_t uatomic_max_t; ({ \ __typeof (*(mem)) __tmp; \ __typeof (mem) __memp = (mem); \ - __asm __volatile (__ARCH_REL_INSTR 
"\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ "1: lwarx %0,0,%1" MUTEX_HINT_REL "\n" \ " cmpw %0,%2\n" \ " bne 2f\n" \ @@ -110,13 +122,20 @@ typedef uintmax_t uatomic_max_t; : "=&r" (__tmp) \ : "b" (__memp), "r" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = *__memp; \ + if (__tmp == oldval) \ + *__memp = newval; \ + } \ __tmp; \ }) #define __arch_atomic_exchange_32_acq(mem, value) \ ({ \ __typeof (*mem) __val; \ - __asm __volatile ( \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ "1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ " stwcx. %3,0,%2\n" \ " bne- 1b\n" \ @@ -124,64 +143,92 @@ typedef uintmax_t uatomic_max_t; : "=&r" (__val), "=m" (*mem) \ : "b" (mem), "r" (value), "m" (*mem) \ : "cr0", "memory"); \ + else \ + { \ + __val = *mem; \ + *mem = value; \ + } \ __val; \ }) #define __arch_atomic_exchange_32_rel(mem, value) \ ({ \ __typeof (*mem) __val; \ - __asm __volatile (__ARCH_REL_INSTR "\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ "1: lwarx %0,0,%2" MUTEX_HINT_REL "\n" \ " stwcx. %3,0,%2\n" \ " bne- 1b" \ : "=&r" (__val), "=m" (*mem) \ : "b" (mem), "r" (value), "m" (*mem) \ : "cr0", "memory"); \ + else \ + { \ + __val = *mem; \ + *mem = value; \ + } \ __val; \ }) #define __arch_atomic_exchange_and_add_32(mem, value) \ ({ \ __typeof (*mem) __val, __tmp; \ - __asm __volatile ("1: lwarx %0,0,%3\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ + "1: lwarx %0,0,%3\n" \ " add %1,%0,%4\n" \ " stwcx. %1,0,%3\n" \ " bne- 1b" \ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ : "b" (mem), "r" (value), "m" (*mem) \ : "cr0", "memory"); \ + else \ + { \ + __val = *mem; \ + *mem += value; \ + } \ __val; \ }) #define __arch_atomic_increment_val_32(mem) \ ({ \ __typeof (*(mem)) __val; \ - __asm __volatile ("1: lwarx %0,0,%2\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ + "1: lwarx %0,0,%2\n" \ " addi %0,%0,1\n" \ " stwcx. 
%0,0,%2\n" \ " bne- 1b" \ : "=&b" (__val), "=m" (*mem) \ : "b" (mem), "m" (*mem) \ : "cr0", "memory"); \ + else \ + __val = ++(*mem); \ __val; \ }) #define __arch_atomic_decrement_val_32(mem) \ ({ \ __typeof (*(mem)) __val; \ - __asm __volatile ("1: lwarx %0,0,%2\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ + "1: lwarx %0,0,%2\n" \ " subi %0,%0,1\n" \ " stwcx. %0,0,%2\n" \ " bne- 1b" \ : "=&b" (__val), "=m" (*mem) \ : "b" (mem), "m" (*mem) \ : "cr0", "memory"); \ + else \ + __val = --(*mem); \ __val; \ }) #define __arch_atomic_decrement_if_positive_32(mem) \ ({ int __val, __tmp; \ - __asm __volatile ("1: lwarx %0,0,%3\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ + "1: lwarx %0,0,%3\n" \ " cmpwi 0,%0,0\n" \ " addi %1,%0,-1\n" \ " ble 2f\n" \ @@ -191,6 +238,12 @@ typedef uintmax_t uatomic_max_t; : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ : "b" (mem), "m" (*mem) \ : "cr0", "memory"); \ + else \ + { \ + __val = (*mem); \ + if (__val > 0) \ + --(*mem); \ + } \ __val; \ }) diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h index 7613bdc..08043a7 100644 --- a/sysdeps/powerpc/powerpc32/bits/atomic.h +++ b/sysdeps/powerpc/powerpc32/bits/atomic.h @@ -17,6 +17,8 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#include <tls.h> + /* POWER6 adds a "Mutex Hint" to the Load and Reserve instruction. This is a hint to the hardware to expect additional updates adjacent to the lock word or not. If we are acquiring a Mutex, the hint @@ -33,6 +35,14 @@ # define MUTEX_HINT_REL #endif +/* Check if the process has created a thread. 
*/ +#ifndef NOT_IN_libc +# define SINGLE_THREAD_P \ + (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) +#else +# define SINGLE_THREAD_P 0 +#endif + /* * The 32-bit exchange_bool is different on powerpc64 because the subf * does signed 64-bit arithmetic while the lwarx is 32-bit unsigned @@ -42,7 +52,8 @@ #define __arch_compare_and_exchange_bool_32_acq(mem, newval, oldval) \ ({ \ unsigned int __tmp; \ - __asm __volatile ( \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ "1: lwarx %0,0,%1" MUTEX_HINT_ACQ "\n" \ " subf. %0,%2,%0\n" \ " bne 2f\n" \ @@ -52,13 +63,20 @@ : "=&r" (__tmp) \ : "b" (mem), "r" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = !(*mem == oldval); \ + if (!__tmp) \ + *mem = newval; \ + } \ __tmp != 0; \ }) #define __arch_compare_and_exchange_bool_32_rel(mem, newval, oldval) \ ({ \ unsigned int __tmp; \ - __asm __volatile (__ARCH_REL_INSTR "\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ "1: lwarx %0,0,%1" MUTEX_HINT_REL "\n" \ " subf. %0,%2,%0\n" \ " bne 2f\n" \ @@ -68,6 +86,12 @@ : "=&r" (__tmp) \ : "b" (mem), "r" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = !(*mem == oldval); \ + if (!__tmp) \ + *mem = newval; \ + } \ __tmp != 0; \ }) diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h index 527fe7c..0e2fe98 100644 --- a/sysdeps/powerpc/powerpc64/bits/atomic.h +++ b/sysdeps/powerpc/powerpc64/bits/atomic.h @@ -17,6 +17,8 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#include <tls.h> + /* POWER6 adds a "Mutex Hint" to the Load and Reserve instruction. This is a hint to the hardware to expect additional updates adjacent to the lock word or not. If we are acquiring a Mutex, the hint @@ -33,6 +35,15 @@ # define MUTEX_HINT_REL #endif +/* Check if the process has created a thread. The lock optimization is only + for locks within libc.so. 
*/ +#ifndef NOT_IN_libc +# define SINGLE_THREAD_P \ + (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0) +#else +# define SINGLE_THREAD_P 0 +#endif + /* The 32-bit exchange_bool is different on powerpc64 because the subf does signed 64-bit arithmetic while the lwarx is 32-bit unsigned (a load word and zero (high 32) form) load. @@ -42,7 +53,8 @@ #define __arch_compare_and_exchange_bool_32_acq(mem, newval, oldval) \ ({ \ unsigned int __tmp, __tmp2; \ - __asm __volatile (" clrldi %1,%1,32\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile (" clrldi %1,%1,32\n" \ "1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ " subf. %0,%1,%0\n" \ " bne 2f\n" \ @@ -52,13 +64,20 @@ : "=&r" (__tmp), "=r" (__tmp2) \ : "b" (mem), "1" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = !(*mem == oldval); \ + if (!__tmp) \ + *mem = newval; \ + } \ __tmp != 0; \ }) #define __arch_compare_and_exchange_bool_32_rel(mem, newval, oldval) \ ({ \ unsigned int __tmp, __tmp2; \ - __asm __volatile (__ARCH_REL_INSTR "\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ " clrldi %1,%1,32\n" \ "1: lwarx %0,0,%2" MUTEX_HINT_REL "\n" \ " subf. %0,%1,%0\n" \ @@ -69,6 +88,12 @@ : "=&r" (__tmp), "=r" (__tmp2) \ : "b" (mem), "1" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = !(*mem == oldval); \ + if (!__tmp) \ + *mem = newval; \ + } \ __tmp != 0; \ }) @@ -80,7 +105,8 @@ #define __arch_compare_and_exchange_bool_64_acq(mem, newval, oldval) \ ({ \ unsigned long __tmp; \ - __asm __volatile ( \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ "1: ldarx %0,0,%1" MUTEX_HINT_ACQ "\n" \ " subf. 
%0,%2,%0\n" \ " bne 2f\n" \ @@ -90,13 +116,20 @@ : "=&r" (__tmp) \ : "b" (mem), "r" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = !(*mem == oldval); \ + if (!__tmp) \ + *mem = newval; \ + } \ __tmp != 0; \ }) #define __arch_compare_and_exchange_bool_64_rel(mem, newval, oldval) \ ({ \ unsigned long __tmp; \ - __asm __volatile (__ARCH_REL_INSTR "\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ "1: ldarx %0,0,%2" MUTEX_HINT_REL "\n" \ " subf. %0,%2,%0\n" \ " bne 2f\n" \ @@ -106,6 +139,12 @@ : "=&r" (__tmp) \ : "b" (mem), "r" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = !(*mem == oldval); \ + if (!__tmp) \ + *mem = newval; \ + } \ __tmp != 0; \ }) @@ -113,7 +152,8 @@ ({ \ __typeof (*(mem)) __tmp; \ __typeof (mem) __memp = (mem); \ - __asm __volatile ( \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ( \ "1: ldarx %0,0,%1" MUTEX_HINT_ACQ "\n" \ " cmpd %0,%2\n" \ " bne 2f\n" \ @@ -123,6 +163,12 @@ : "=&r" (__tmp) \ : "b" (__memp), "r" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = *__memp; \ + if (__tmp == oldval) \ + *__memp = newval; \ + } \ __tmp; \ }) @@ -130,7 +176,8 @@ ({ \ __typeof (*(mem)) __tmp; \ __typeof (mem) __memp = (mem); \ - __asm __volatile (__ARCH_REL_INSTR "\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ "1: ldarx %0,0,%1" MUTEX_HINT_REL "\n" \ " cmpd %0,%2\n" \ " bne 2f\n" \ @@ -140,13 +187,20 @@ : "=&r" (__tmp) \ : "b" (__memp), "r" (oldval), "r" (newval) \ : "cr0", "memory"); \ + else \ + { \ + __tmp = *__memp; \ + if (__tmp == oldval) \ + *__memp = newval; \ + } \ __tmp; \ }) #define __arch_atomic_exchange_64_acq(mem, value) \ ({ \ __typeof (*mem) __val; \ - __asm __volatile (__ARCH_REL_INSTR "\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ "1: ldarx %0,0,%2" MUTEX_HINT_ACQ "\n" \ " stdcx. 
%3,0,%2\n" \ " bne- 1b\n" \ @@ -154,64 +208,88 @@ : "=&r" (__val), "=m" (*mem) \ : "b" (mem), "r" (value), "m" (*mem) \ : "cr0", "memory"); \ + else \ + { \ + __val = *mem; \ + *mem = value; \ + } \ __val; \ }) #define __arch_atomic_exchange_64_rel(mem, value) \ ({ \ __typeof (*mem) __val; \ - __asm __volatile (__ARCH_REL_INSTR "\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ "1: ldarx %0,0,%2" MUTEX_HINT_REL "\n" \ " stdcx. %3,0,%2\n" \ " bne- 1b" \ : "=&r" (__val), "=m" (*mem) \ : "b" (mem), "r" (value), "m" (*mem) \ : "cr0", "memory"); \ + else \ + { \ + __val = *mem; \ + *mem = value; \ + } \ __val; \ }) #define __arch_atomic_exchange_and_add_64(mem, value) \ ({ \ __typeof (*mem) __val, __tmp; \ - __asm __volatile ("1: ldarx %0,0,%3\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ("1: ldarx %0,0,%3\n" \ " add %1,%0,%4\n" \ " stdcx. %1,0,%3\n" \ " bne- 1b" \ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ : "b" (mem), "r" (value), "m" (*mem) \ : "cr0", "memory"); \ + else \ + { \ + __val = *mem; \ + *mem += value; \ + } \ __val; \ }) #define __arch_atomic_increment_val_64(mem) \ ({ \ __typeof (*(mem)) __val; \ - __asm __volatile ("1: ldarx %0,0,%2\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ("1: ldarx %0,0,%2\n" \ " addi %0,%0,1\n" \ " stdcx. %0,0,%2\n" \ " bne- 1b" \ : "=&b" (__val), "=m" (*mem) \ : "b" (mem), "m" (*mem) \ : "cr0", "memory"); \ + else \ + __val = ++(*mem); \ __val; \ }) #define __arch_atomic_decrement_val_64(mem) \ ({ \ __typeof (*(mem)) __val; \ - __asm __volatile ("1: ldarx %0,0,%2\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ("1: ldarx %0,0,%2\n" \ " subi %0,%0,1\n" \ " stdcx. 
%0,0,%2\n" \ " bne- 1b" \ : "=&b" (__val), "=m" (*mem) \ : "b" (mem), "m" (*mem) \ : "cr0", "memory"); \ + else \ + __val = --(*mem); \ __val; \ }) #define __arch_atomic_decrement_if_positive_64(mem) \ ({ int __val, __tmp; \ - __asm __volatile ("1: ldarx %0,0,%3\n" \ + if (!SINGLE_THREAD_P) \ + __asm __volatile ("1: ldarx %0,0,%3\n" \ " cmpdi 0,%0,0\n" \ " addi %1,%0,-1\n" \ " ble 2f\n" \ @@ -221,6 +299,12 @@ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ : "b" (mem), "m" (*mem) \ : "cr0", "memory"); \ + else \ + { \ + __val = (*mem); \ + if (__val > 0) \ + --(*mem); \ + } \ __val; \ })