
[v2] nptl: Add backoff mechanism to spinlock loop

Message ID 20220328084705.468207-1-wangyang.guo@intel.com
State New
Series [v2] nptl: Add backoff mechanism to spinlock loop

Commit Message

Guo, Wangyang March 28, 2022, 8:47 a.m. UTC
When multiple threads wait for a lock at the same time, once the lock owner
releases it, all waiters see the lock become available and try to lock it at
once, which may cause an expensive CAS storm.

Binary exponential backoff with random jitter is introduced.  As try-lock
attempts increase, it becomes more likely that a larger number of threads
are competing for the adaptive mutex, so the wait time grows exponentially.
A random jitter is also added to avoid synchronized try-lock attempts from
other threads.
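For illustration, a minimal standalone sketch of the idea (not the glibc
patch itself, which appears below; the atomic flag, the use of rand () as a
jitter source, and the empty-asm pause stand-in are all assumptions of the
sketch) could look like:

```c
#include <stdatomic.h>
#include <stdlib.h>

/* Sketch only, not the glibc code: exponential backoff with jitter over a
   plain atomic flag.  rand () stands in for random_bits (), the empty asm
   for atomic_spin_nop (), and the final blocking loop for LLL_MUTEX_LOCK.  */
static void
lock_with_backoff (atomic_int *lock, int max_spins)
{
  int backoff = 1;
  int total = 0;
  unsigned int jitter = (unsigned int) rand ();

  while (atomic_exchange_explicit (lock, 1, memory_order_acquire) != 0)
    {
      /* Spin count is the backoff plus a jitter in [0, backoff - 1].  */
      int spins = backoff + (int) (jitter & (unsigned int) (backoff - 1));
      total += spins;
      if (total >= max_spins)
        {
          /* Give up spinning; the real code blocks on the futex here.  */
          while (atomic_exchange_explicit (lock, 1, memory_order_acquire) != 0)
            ;
          return;
        }
      while (spins-- > 0)
        __asm__ __volatile__ ("" ::: "memory");   /* stand-in for pause */
      backoff <<= 1;   /* double the backoff for the next round */
    }
}
```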

v2: Remove read-check before try-lock for performance.

Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
---
 nptl/pthread_mutex_lock.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

Comments

Noah Goldstein March 28, 2022, 4:41 p.m. UTC | #1
On Mon, Mar 28, 2022 at 3:47 AM Wangyang Guo <wangyang.guo@intel.com> wrote:
>
> When mutiple threads waiting for lock at the same time, once lock owner
> releases the lock, waiters will see lock available and all try to lock,
> which may cause an expensive CAS storm.
>
> Binary exponential backoff with random jitter is introduced. As try-lock
> attempt increases, there is more likely that a larger number threads
> compete for adaptive mutex lock, so increase wait time in exponential.
> A random jitter is also added to avoid synchronous try-lock from other
> threads.
>
> v2: Remove read-check before try-lock for performance.
>
> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
> ---
>  nptl/pthread_mutex_lock.c | 25 ++++++++++++++++---------
>  1 file changed, 16 insertions(+), 9 deletions(-)
>
> diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
> index d2e652d151..7e75ec1cba 100644
> --- a/nptl/pthread_mutex_lock.c
> +++ b/nptl/pthread_mutex_lock.c
> @@ -26,6 +26,7 @@
>  #include <futex-internal.h>
>  #include <stap-probe.h>
>  #include <shlib-compat.h>
> +#include <random-bits.h>
>
>  /* Some of the following definitions differ when pthread_mutex_cond_lock.c
>     includes this file.  */
> @@ -64,11 +65,6 @@ lll_mutex_lock_optimized (pthread_mutex_t *mutex)
>  # define PTHREAD_MUTEX_VERSIONS 1
>  #endif
>
> -#ifndef LLL_MUTEX_READ_LOCK
> -# define LLL_MUTEX_READ_LOCK(mutex) \
> -  atomic_load_relaxed (&(mutex)->__data.__lock)
> -#endif
> -
>  static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
>       __attribute_noinline__;
>
> @@ -138,17 +134,28 @@ PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
>           int cnt = 0;
>           int max_cnt = MIN (max_adaptive_count (),
>                              mutex->__data.__spins * 2 + 10);
> +         int spin_count, exp_backoff = 1;
> +         unsigned int jitter = random_bits ();
>           do
>             {
> -             if (cnt++ >= max_cnt)
> +             /* In each loop, spin count is exponential backoff plus
> +                random jitter, random range is [0, exp_backoff-1].  */
> +             spin_count = exp_backoff + (jitter & (exp_backoff - 1));
> +             cnt += spin_count;
> +             if (cnt >= max_cnt)
>                 {
> +                 /* If cnt exceeds max spin count, just go to wait
> +                    queue.  */
>                   LLL_MUTEX_LOCK (mutex);
>                   break;
>                 }
> -             atomic_spin_nop ();
> +             do
> +                 atomic_spin_nop ();
> +             while (--spin_count > 0);
> +             /* Binary exponential backoff, prepare for next loop.  */
> +             exp_backoff <<= 1;
>             }
> -         while (LLL_MUTEX_READ_LOCK (mutex) != 0
> -                || LLL_MUTEX_TRYLOCK (mutex) != 0);
> +         while (LLL_MUTEX_TRYLOCK (mutex) != 0);

Does it perform better w.o the read?

In general can you post some benchmarks varying the number of threads / size of
the critical section? Would also be nice if you could collect some
stats regarding
the average number of failed CAS attempts before/after.
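
One hedged way to collect such a statistic in a test harness (this is not a
glibc API; the counter and the try-lock wrapper below are hypothetical) would
be to wrap the CAS and count its failures:

```c
#include <stdatomic.h>

/* Hypothetical benchmark instrumentation: count how many CAS attempts
   fail before the lock is finally acquired.  */
static _Atomic unsigned long failed_cas;

static int
counting_trylock (atomic_int *lock)
{
  int expected = 0;
  if (atomic_compare_exchange_strong_explicit (lock, &expected, 1,
                                               memory_order_acquire,
                                               memory_order_relaxed))
    return 0;   /* acquired */
  atomic_fetch_add_explicit (&failed_cas, 1, memory_order_relaxed);
  return 1;     /* failed; caller retries or backs off */
}
```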
>
>           mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
>         }
> --
> 2.35.1
>
Guo, Wangyang via Libc-alpha March 30, 2022, 11:44 a.m. UTC | #2
On 3/29/2022 12:41 AM, Noah Goldstein via Libc-alpha wrote:
> On Mon, Mar 28, 2022 at 3:47 AM Wangyang Guo <wangyang.guo@intel.com> wrote:
>>
>> When mutiple threads waiting for lock at the same time, once lock owner
>> releases the lock, waiters will see lock available and all try to lock,
>> which may cause an expensive CAS storm.
>>
>> Binary exponential backoff with random jitter is introduced. As try-lock
>> attempt increases, there is more likely that a larger number threads
>> compete for adaptive mutex lock, so increase wait time in exponential.
>> A random jitter is also added to avoid synchronous try-lock from other
>> threads.
>>
>> v2: Remove read-check before try-lock for performance.
>>
>> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
>> ---
>>   nptl/pthread_mutex_lock.c | 25 ++++++++++++++++---------
>>   1 file changed, 16 insertions(+), 9 deletions(-)
>>
>> diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
>> index d2e652d151..7e75ec1cba 100644
>> --- a/nptl/pthread_mutex_lock.c
>> +++ b/nptl/pthread_mutex_lock.c
>> @@ -26,6 +26,7 @@
>>   #include <futex-internal.h>
>>   #include <stap-probe.h>
>>   #include <shlib-compat.h>
>> +#include <random-bits.h>
>>
>>   /* Some of the following definitions differ when pthread_mutex_cond_lock.c
>>      includes this file.  */
>> @@ -64,11 +65,6 @@ lll_mutex_lock_optimized (pthread_mutex_t *mutex)
>>   # define PTHREAD_MUTEX_VERSIONS 1
>>   #endif
>>
>> -#ifndef LLL_MUTEX_READ_LOCK
>> -# define LLL_MUTEX_READ_LOCK(mutex) \
>> -  atomic_load_relaxed (&(mutex)->__data.__lock)
>> -#endif
>> -
>>   static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
>>        __attribute_noinline__;
>>
>> @@ -138,17 +134,28 @@ PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
>>            int cnt = 0;
>>            int max_cnt = MIN (max_adaptive_count (),
>>                               mutex->__data.__spins * 2 + 10);
>> +         int spin_count, exp_backoff = 1;
>> +         unsigned int jitter = random_bits ();
>>            do
>>              {
>> -             if (cnt++ >= max_cnt)
>> +             /* In each loop, spin count is exponential backoff plus
>> +                random jitter, random range is [0, exp_backoff-1].  */
>> +             spin_count = exp_backoff + (jitter & (exp_backoff - 1));
>> +             cnt += spin_count;
>> +             if (cnt >= max_cnt)
>>                  {
>> +                 /* If cnt exceeds max spin count, just go to wait
>> +                    queue.  */
>>                    LLL_MUTEX_LOCK (mutex);
>>                    break;
>>                  }
>> -             atomic_spin_nop ();
>> +             do
>> +                 atomic_spin_nop ();
>> +             while (--spin_count > 0);
>> +             /* Binary exponential backoff, prepare for next loop.  */
>> +             exp_backoff <<= 1;
>>              }
>> -         while (LLL_MUTEX_READ_LOCK (mutex) != 0
>> -                || LLL_MUTEX_TRYLOCK (mutex) != 0);
>> +         while (LLL_MUTEX_TRYLOCK (mutex) != 0);
> 
> Does it perform better w.o the read?
> 
> In general can you post some benchmarks varying the number of threads / size of
> the critical section? Would also be nice if you could collect some
> stats regarding
> the average number of failed CAS attempts before/after.

I worked out a mutex benchmark:
https://gist.github.com/guowangy/459085e5be6bfeda101c591d2d2304c5
It can test with different thread counts and critical section sizes (the
critical section length is determined by the number of pause instructions).
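
A minimal sketch of the shape of such a harness follows (the thread count
handling, NPAUSE and ITERS are illustrative, not values taken from the gist;
a real harness would also time the loop to compute throughput):

```c
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Each thread repeatedly takes an adaptive mutex, holds it for NPAUSE
   pause-like iterations, and releases it.  */
#define NPAUSE 16
#define ITERS  1000000

static pthread_mutex_t m;

static void *
worker (void *arg)
{
  for (long i = 0; i < ITERS; i++)
    {
      pthread_mutex_lock (&m);
      for (long j = 0; j < NPAUSE; j++)
        __asm__ __volatile__ ("" ::: "memory");  /* stand-in for pause */
      pthread_mutex_unlock (&m);
    }
  return NULL;
}

int
main (int argc, char **argv)
{
  int nthreads = argc > 1 ? atoi (argv[1]) : 4;

  pthread_mutexattr_t attr;
  pthread_mutexattr_init (&attr);
  pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
  pthread_mutex_init (&m, &attr);

  pthread_t tids[nthreads];
  for (int i = 0; i < nthreads; i++)
    pthread_create (&tids[i], NULL, worker, NULL);
  for (int i = 0; i < nthreads; i++)
    pthread_join (&tids[i], NULL);

  printf ("done: %d threads x %d iterations\n", nthreads, ITERS);
  return 0;
}
```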

Here is the result of v1 against upstream:
(Tested in Xeon 8380)

npause	1	4	16	64	128   <== thread num
--------------------------------------------
0	1.00	1.70	1.94	1.70	1.76
1	0.99	1.57	1.75	1.54	1.58
4	1.00	1.28	1.42	1.38	1.41
16	1.00	1.02	1.11	1.18	1.19
64	1.00	1.00	0.95	0.99	0.97
128	1.00	0.92	0.92	0.87	0.87

The first row header is the number of threads.
The first column is the number of pauses, which represents different
critical section sizes.
The value is the ratio of v1/upstream throughput (higher is better).
RSD varies by thread count: threads (4, 16) < 2%, threads (1, 64, 128) < 1%.

Thread=1 is the baseline and should be the same for both.
Backoff works well when the critical section is small.
But it causes some regression when the critical section is very large,
because upstream attempts the lock frequently to reduce latency.
Since the adaptive mutex is aimed at 'quick' locks, I think that's not a
big problem here.

The next is v2 against v1:
	1	4	16	64	128
--------------------------------------------
0	1.00	0.99	0.95	1.02	1.02
1	1.00	1.08	0.92	1.04	1.04
4	1.00	0.94	0.94	1.05	1.04
16	1.00	1.01	0.98	1.03	1.03
64	1.00	1.01	1.00	1.00	1.01
128	1.00	1.02	1.00	1.00	1.00

v2 without the read check benefits at large thread counts, but has a
drawback at small thread counts.

I also have a version with random jitter removed (v3), here is the 
result of v3 against v1:
	1	4	16	64	128
--------------------------------------------
0	1.00	0.95	1.04	1.02	1.03
1	1.01	1.09	1.04	1.02	1.03
4	1.00	1.02	1.05	1.02	1.02
16	1.00	1.01	1.01	1.02	1.02
64	1.00	1.04	1.02	1.01	1.01
128	1.00	0.98	0.99	0.99	0.98

The result shows that the version without random jitter is a better solution.

>>
>>            mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
>>          }
>> --
>> 2.35.1
>>
>
Adhemerval Zanella Netto March 30, 2022, 11:53 a.m. UTC | #3
On 28/03/2022 05:47, Wangyang Guo via Libc-alpha wrote:
> When mutiple threads waiting for lock at the same time, once lock owner
> releases the lock, waiters will see lock available and all try to lock,
> which may cause an expensive CAS storm.
> 
> Binary exponential backoff with random jitter is introduced. As try-lock
> attempt increases, there is more likely that a larger number threads
> compete for adaptive mutex lock, so increase wait time in exponential.
> A random jitter is also added to avoid synchronous try-lock from other
> threads.
> 
> v2: Remove read-check before try-lock for performance.
> 
> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
> ---
>  nptl/pthread_mutex_lock.c | 25 ++++++++++++++++---------
>  1 file changed, 16 insertions(+), 9 deletions(-)
> 
> diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
> index d2e652d151..7e75ec1cba 100644
> --- a/nptl/pthread_mutex_lock.c
> +++ b/nptl/pthread_mutex_lock.c
> @@ -26,6 +26,7 @@
>  #include <futex-internal.h>
>  #include <stap-probe.h>
>  #include <shlib-compat.h>
> +#include <random-bits.h>
>  
>  /* Some of the following definitions differ when pthread_mutex_cond_lock.c
>     includes this file.  */
> @@ -64,11 +65,6 @@ lll_mutex_lock_optimized (pthread_mutex_t *mutex)
>  # define PTHREAD_MUTEX_VERSIONS 1
>  #endif
>  
> -#ifndef LLL_MUTEX_READ_LOCK
> -# define LLL_MUTEX_READ_LOCK(mutex) \
> -  atomic_load_relaxed (&(mutex)->__data.__lock)
> -#endif
> -
>  static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
>       __attribute_noinline__;
>  
> @@ -138,17 +134,28 @@ PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
>  	  int cnt = 0;
>  	  int max_cnt = MIN (max_adaptive_count (),
>  			     mutex->__data.__spins * 2 + 10);
> +	  int spin_count, exp_backoff = 1;
> +	  unsigned int jitter = random_bits ();

This will issue a syscall for architectures that do not have clock_gettime
in the vDSO, which is a performance regression.  You will need to move the
jitter setup to be arch-specific, with the generic interface setting no
random jitter.
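
A hedged sketch of what that split could look like (the macro name is made
up for illustration, not an existing glibc interface):

```c
/* Hypothetical arch hook: the generic definition provides no jitter, so
   targets whose random_bits () would fall back to a clock_gettime syscall
   pay nothing; an architecture with a vDSO clock (or another cheap entropy
   source) overrides it in its sysdeps header.  */
#ifndef MUTEX_SPIN_JITTER
# define MUTEX_SPIN_JITTER() 0u            /* generic: no random jitter */
#endif

/* An x86-like override in a sysdeps header might then be:
   # define MUTEX_SPIN_JITTER() random_bits ()  */
```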

>  	  do
>  	    {
> -	      if (cnt++ >= max_cnt)
> +	      /* In each loop, spin count is exponential backoff plus
> +	         random jitter, random range is [0, exp_backoff-1].  */
> +	      spin_count = exp_backoff + (jitter & (exp_backoff - 1));
> +	      cnt += spin_count;
> +	      if (cnt >= max_cnt)
>  		{
> +		  /* If cnt exceeds max spin count, just go to wait
> +		     queue.  */
>  		  LLL_MUTEX_LOCK (mutex);
>  		  break;
>  		}
> -	      atomic_spin_nop ();
> +	      do
> +		  atomic_spin_nop ();
> +	      while (--spin_count > 0);
> +	      /* Binary exponential backoff, prepare for next loop.  */
> +	      exp_backoff <<= 1;
>  	    }
> -	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
> -		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
> +	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
>  
>  	  mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
>  	}
Noah Goldstein March 30, 2022, 5:07 p.m. UTC | #4
On Wed, Mar 30, 2022 at 6:54 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 28/03/2022 05:47, Wangyang Guo via Libc-alpha wrote:
> > When mutiple threads waiting for lock at the same time, once lock owner
> > releases the lock, waiters will see lock available and all try to lock,
> > which may cause an expensive CAS storm.
> >
> > Binary exponential backoff with random jitter is introduced. As try-lock
> > attempt increases, there is more likely that a larger number threads
> > compete for adaptive mutex lock, so increase wait time in exponential.
> > A random jitter is also added to avoid synchronous try-lock from other
> > threads.
> >
> > v2: Remove read-check before try-lock for performance.
> >
> > Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
> > ---
> >  nptl/pthread_mutex_lock.c | 25 ++++++++++++++++---------
> >  1 file changed, 16 insertions(+), 9 deletions(-)
> >
> > diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
> > index d2e652d151..7e75ec1cba 100644
> > --- a/nptl/pthread_mutex_lock.c
> > +++ b/nptl/pthread_mutex_lock.c
> > @@ -26,6 +26,7 @@
> >  #include <futex-internal.h>
> >  #include <stap-probe.h>
> >  #include <shlib-compat.h>
> > +#include <random-bits.h>
> >
> >  /* Some of the following definitions differ when pthread_mutex_cond_lock.c
> >     includes this file.  */
> > @@ -64,11 +65,6 @@ lll_mutex_lock_optimized (pthread_mutex_t *mutex)
> >  # define PTHREAD_MUTEX_VERSIONS 1
> >  #endif
> >
> > -#ifndef LLL_MUTEX_READ_LOCK
> > -# define LLL_MUTEX_READ_LOCK(mutex) \
> > -  atomic_load_relaxed (&(mutex)->__data.__lock)
> > -#endif
> > -
> >  static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
> >       __attribute_noinline__;
> >
> > @@ -138,17 +134,28 @@ PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
> >         int cnt = 0;
> >         int max_cnt = MIN (max_adaptive_count (),
> >                            mutex->__data.__spins * 2 + 10);
> > +       int spin_count, exp_backoff = 1;
> > +       unsigned int jitter = random_bits ();
>
> This will issue a syscall for architectures that do not have clock_gettime
> on vDSO, which is a performance regression.  You will need to move the
> jitter setup to be arch-specific, where the generic interface setting
> no random jitter.

What would be the best init jitter for arch w/ only syscall timers? TID? Or
something else?
>
> >         do
> >           {
> > -           if (cnt++ >= max_cnt)
> > +           /* In each loop, spin count is exponential backoff plus
> > +              random jitter, random range is [0, exp_backoff-1].  */
> > +           spin_count = exp_backoff + (jitter & (exp_backoff - 1));
> > +           cnt += spin_count;
> > +           if (cnt >= max_cnt)
> >               {
> > +               /* If cnt exceeds max spin count, just go to wait
> > +                  queue.  */
> >                 LLL_MUTEX_LOCK (mutex);
> >                 break;
> >               }
> > -           atomic_spin_nop ();
> > +           do
> > +               atomic_spin_nop ();
> > +           while (--spin_count > 0);
> > +           /* Binary exponential backoff, prepare for next loop.  */
> > +           exp_backoff <<= 1;
> >           }
> > -       while (LLL_MUTEX_READ_LOCK (mutex) != 0
> > -              || LLL_MUTEX_TRYLOCK (mutex) != 0);
> > +       while (LLL_MUTEX_TRYLOCK (mutex) != 0);
> >
> >         mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
> >       }
Adhemerval Zanella Netto March 30, 2022, 5:21 p.m. UTC | #5
On 30/03/2022 14:07, Noah Goldstein wrote:
> On Wed, Mar 30, 2022 at 6:54 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>>
>>
>> On 28/03/2022 05:47, Wangyang Guo via Libc-alpha wrote:
>>> When mutiple threads waiting for lock at the same time, once lock owner
>>> releases the lock, waiters will see lock available and all try to lock,
>>> which may cause an expensive CAS storm.
>>>
>>> Binary exponential backoff with random jitter is introduced. As try-lock
>>> attempt increases, there is more likely that a larger number threads
>>> compete for adaptive mutex lock, so increase wait time in exponential.
>>> A random jitter is also added to avoid synchronous try-lock from other
>>> threads.
>>>
>>> v2: Remove read-check before try-lock for performance.
>>>
>>> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
>>> ---
>>>  nptl/pthread_mutex_lock.c | 25 ++++++++++++++++---------
>>>  1 file changed, 16 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
>>> index d2e652d151..7e75ec1cba 100644
>>> --- a/nptl/pthread_mutex_lock.c
>>> +++ b/nptl/pthread_mutex_lock.c
>>> @@ -26,6 +26,7 @@
>>>  #include <futex-internal.h>
>>>  #include <stap-probe.h>
>>>  #include <shlib-compat.h>
>>> +#include <random-bits.h>
>>>
>>>  /* Some of the following definitions differ when pthread_mutex_cond_lock.c
>>>     includes this file.  */
>>> @@ -64,11 +65,6 @@ lll_mutex_lock_optimized (pthread_mutex_t *mutex)
>>>  # define PTHREAD_MUTEX_VERSIONS 1
>>>  #endif
>>>
>>> -#ifndef LLL_MUTEX_READ_LOCK
>>> -# define LLL_MUTEX_READ_LOCK(mutex) \
>>> -  atomic_load_relaxed (&(mutex)->__data.__lock)
>>> -#endif
>>> -
>>>  static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
>>>       __attribute_noinline__;
>>>
>>> @@ -138,17 +134,28 @@ PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
>>>         int cnt = 0;
>>>         int max_cnt = MIN (max_adaptive_count (),
>>>                            mutex->__data.__spins * 2 + 10);
>>> +       int spin_count, exp_backoff = 1;
>>> +       unsigned int jitter = random_bits ();
>>
>> This will issue a syscall for architectures that do not have clock_gettime
>> on vDSO, which is a performance regression.  You will need to move the
>> jitter setup to be arch-specific, where the generic interface setting
>> no random jitter.
> 
> What would be the best init jitter for arch w/ only syscall timers? TID? Or
> something else?

We don't cache the TID anymore, so it would still generate syscalls.  I think
it is safer that, since we don't have an internal source of entropy (Florian
sent an arc4random implementation some time ago that we could use in this
case), the generic interface would not do any jitter at all (so it would
basically be the algorithm we already have).
Noah Goldstein March 30, 2022, 7:39 p.m. UTC | #6
On Wed, Mar 30, 2022 at 6:45 AM Guo, Wangyang <wangyang.guo@intel.com> wrote:
>
> On 3/29/2022 12:41 AM, Noah Goldstein via Libc-alpha wrote:
> > On Mon, Mar 28, 2022 at 3:47 AM Wangyang Guo <wangyang.guo@intel.com> wrote:
> >>
> >> When mutiple threads waiting for lock at the same time, once lock owner
> >> releases the lock, waiters will see lock available and all try to lock,
> >> which may cause an expensive CAS storm.
> >>
> >> Binary exponential backoff with random jitter is introduced. As try-lock
> >> attempt increases, there is more likely that a larger number threads
> >> compete for adaptive mutex lock, so increase wait time in exponential.
> >> A random jitter is also added to avoid synchronous try-lock from other
> >> threads.
> >>
> >> v2: Remove read-check before try-lock for performance.
> >>
> >> Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
> >> ---
> >>   nptl/pthread_mutex_lock.c | 25 ++++++++++++++++---------
> >>   1 file changed, 16 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
> >> index d2e652d151..7e75ec1cba 100644
> >> --- a/nptl/pthread_mutex_lock.c
> >> +++ b/nptl/pthread_mutex_lock.c
> >> @@ -26,6 +26,7 @@
> >>   #include <futex-internal.h>
> >>   #include <stap-probe.h>
> >>   #include <shlib-compat.h>
> >> +#include <random-bits.h>
> >>
> >>   /* Some of the following definitions differ when pthread_mutex_cond_lock.c
> >>      includes this file.  */
> >> @@ -64,11 +65,6 @@ lll_mutex_lock_optimized (pthread_mutex_t *mutex)
> >>   # define PTHREAD_MUTEX_VERSIONS 1
> >>   #endif
> >>
> >> -#ifndef LLL_MUTEX_READ_LOCK
> >> -# define LLL_MUTEX_READ_LOCK(mutex) \
> >> -  atomic_load_relaxed (&(mutex)->__data.__lock)
> >> -#endif
> >> -
> >>   static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
> >>        __attribute_noinline__;
> >>
> >> @@ -138,17 +134,28 @@ PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
> >>            int cnt = 0;
> >>            int max_cnt = MIN (max_adaptive_count (),
> >>                               mutex->__data.__spins * 2 + 10);
> >> +         int spin_count, exp_backoff = 1;
> >> +         unsigned int jitter = random_bits ();
> >>            do
> >>              {
> >> -             if (cnt++ >= max_cnt)
> >> +             /* In each loop, spin count is exponential backoff plus
> >> +                random jitter, random range is [0, exp_backoff-1].  */
> >> +             spin_count = exp_backoff + (jitter & (exp_backoff - 1));
> >> +             cnt += spin_count;
> >> +             if (cnt >= max_cnt)
> >>                  {
> >> +                 /* If cnt exceeds max spin count, just go to wait
> >> +                    queue.  */
> >>                    LLL_MUTEX_LOCK (mutex);
> >>                    break;
> >>                  }
> >> -             atomic_spin_nop ();
> >> +             do
> >> +                 atomic_spin_nop ();
> >> +             while (--spin_count > 0);
> >> +             /* Binary exponential backoff, prepare for next loop.  */
> >> +             exp_backoff <<= 1;
> >>              }
> >> -         while (LLL_MUTEX_READ_LOCK (mutex) != 0
> >> -                || LLL_MUTEX_TRYLOCK (mutex) != 0);
> >> +         while (LLL_MUTEX_TRYLOCK (mutex) != 0);
> >
> > Does it perform better w.o the read?
> >
> > In general can you post some benchmarks varying the number of threads / size of
> > the critical section? Would also be nice if you could collect some
> > stats regarding
> > the average number of failed CAS attempts before/after.
>
> I work out a mutex bench:
> https://gist.github.com/guowangy/459085e5be6bfeda101c591d2d2304c5
> It can test with different threads and critical section (critical length
> determined by num of pause instructions).

Can you add a benchmark to glibc? Also, you are compiling without
optimizations.

>
> Here is the result of v1 against upstream:
> (Tested in Xeon 8380)
>
> npause  1       4       16      64      128   <== thread num
> --------------------------------------------
> 0       1.00    1.70    1.94    1.70    1.76
> 1       0.99    1.57    1.75    1.54    1.58
> 4       1.00    1.28    1.42    1.38    1.41
> 16      1.00    1.02    1.11    1.18    1.19
> 64      1.00    1.00    0.95    0.99    0.97
> 128     1.00    0.92    0.92    0.87    0.87

I was able to essentially reproduce this.

Maybe in light of these changes `max_cnt` needs a new threshold.
Also maybe instead of `cnt += spin_count` it should stay `cnt++` so we don't
hit the futex so eagerly.
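
A sketch of that variant (bounding the number of backoff rounds rather than
the total pause count; this is a suggestion, not the posted patch, and the
overflow cap is an added assumption):

```c
  do
    {
      /* Keep cnt++ so max_cnt bounds try-lock rounds, not total pauses,
         making the fall-through to the futex less eager.  */
      if (cnt++ >= max_cnt)
        {
          LLL_MUTEX_LOCK (mutex);
          break;
        }
      int spins = exp_backoff + (jitter & (exp_backoff - 1));
      while (spins-- > 0)
        atomic_spin_nop ();
      /* Cap added in this sketch so repeated doubling cannot overflow.  */
      if (exp_backoff < max_cnt)
        exp_backoff <<= 1;
    }
  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
```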


>
> The first row header is number of threads.
> The first column is num of pause, which represents different critical
> section sizes.
> The value is the rate of v1/upstream throughput (High Is Better).
> RSD varies by thread num: thread(4, 16) < 2%, thread(1, 64, 128) < 1%.
>
> Thread=1 is the baseline and should be the same for the both.
> Backoff works well when critical section is small.
> But it cause some regression when critical section is very large,
> because of upstream using frequently attempt to reduce the latency.
> Since adaptive is aimed for 'quick' locks, I think it's not a big
> problem here.
>
> The next is v2 against v1:
>         1       4       16      64      128
> --------------------------------------------
> 0       1.00    0.99    0.95    1.02    1.02
> 1       1.00    1.08    0.92    1.04    1.04
> 4       1.00    0.94    0.94    1.05    1.04
> 16      1.00    1.01    0.98    1.03    1.03
> 64      1.00    1.01    1.00    1.00    1.01
> 128     1.00    1.02    1.00    1.00    1.00
>
> v2 without read-check can have benefit in large thread num, but have
> drawback in small thread num.
>
I am seeing a pretty significant benefit in keeping the read check
on my Tigerlake (only measuring up to 16 threads as it's an 8-core
machine).

W/ read check:
[pause:   0, thread:   1] -- Throughput:   76528393 ops/s RSD: 4.55%
[pause:   0, thread:   4] -- Throughput:   28065221 ops/s RSD: 1.58%
[pause:   0, thread:   8] -- Throughput:   14757408 ops/s RSD: 1.86%
[pause:   0, thread:  16] -- Throughput:   15501579 ops/s RSD: 1.06%

w/o read check:
[pause:   0, thread:   1] -- Throughput:   71303250 ops/s RSD: 4.98%
[pause:   0, thread:   4] -- Throughput:   11308910 ops/s RSD: 8.96%
[pause:   0, thread:   8] -- Throughput:    7380217 ops/s RSD: 5.52%
[pause:   0, thread:  16] -- Throughput:    8253073 ops/s RSD: 2.19%


> I also have a version with random jitter removed (v3), here is the
> result of v3 against v1:
>         1       4       16      64      128
> --------------------------------------------
> 0       1.00    0.95    1.04    1.02    1.03
> 1       1.01    1.09    1.04    1.02    1.03
> 4       1.00    1.02    1.05    1.02    1.02
> 16      1.00    1.01    1.01    1.02    1.02
> 64      1.00    1.04    1.02    1.01    1.01
> 128     1.00    0.98    0.99    0.99    0.98
>
> The result shows without random is a better solution.
>
> >>
> >>            mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
> >>          }
> >> --
> >> 2.35.1
> >>
> >
>
Florian Weimer April 12, 2022, 11:53 a.m. UTC | #7
* Adhemerval Zanella via Libc-alpha:

> On 30/03/2022 14:07, Noah Goldstein wrote:
>> What would be the best init jitter for arch w/ only syscall timers? TID? Or
>> something else?
>
> We don't cached the TID anymore, so it would still generate syscalls.

We do have the TID for the running process in the TCB.  We need it (or
something like it) for recursive locks.

Maybe for this application, we should use some number of upper bits from
a per-thread linear congruential generator, initialized to a random
value at thread start?

Thanks,
Florian
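
A per-thread linear congruential generator of the kind described above might
look like the sketch below; the TLS variable is illustrative, not an existing
glibc internal, and the constants are the common 64-bit LCG multipliers
(Knuth's MMIX values):

```c
#include <stdint.h>

/* Hypothetical per-thread generator: seeded once with a random value at
   thread start; the jitter uses only the upper bits, which are the
   better-distributed ones in an LCG.  */
static __thread uint64_t spin_rand_state;   /* seeded at thread start */

static inline uint32_t
spin_jitter (void)
{
  spin_rand_state = spin_rand_state * 6364136223846793005ULL
                    + 1442695040888963407ULL;
  return (uint32_t) (spin_rand_state >> 32);  /* upper bits only */
}
```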
Yann Droneaud April 22, 2022, 1:30 p.m. UTC | #8
Le 12/04/2022 à 13:53, Florian Weimer via Libc-alpha a écrit :
> * Adhemerval Zanella via Libc-alpha:
>
>> On 30/03/2022 14:07, Noah Goldstein wrote:
>>> What would be the best init jitter for arch w/ only syscall timers? TID? Or
>>> something else?
>> We don't cached the TID anymore, so it would still generate syscalls.
> We do have the TID for the running process in the TCB.  We need it (or
> something like it) for recursive locks.
>
> Maybe for this application, we should use some number of upper bits from
> a per-thread linear consequential generator, initialized to a random
> value at thread start?


As each running thread has its own stack, the thread's stack address can be
used as a seed for such a PRNG.


Regards.
Florian Weimer April 22, 2022, 1:32 p.m. UTC | #9
* Yann Droneaud:

> Le 12/04/2022 à 13:53, Florian Weimer via Libc-alpha a écrit :
>> * Adhemerval Zanella via Libc-alpha:
>>
>>> On 30/03/2022 14:07, Noah Goldstein wrote:
>>>> What would be the best init jitter for arch w/ only syscall timers? TID? Or
>>>> something else?
>>> We don't cached the TID anymore, so it would still generate syscalls.
>> We do have the TID for the running process in the TCB.  We need it (or
>> something like it) for recursive locks.
>>
>> Maybe for this application, we should use some number of upper bits from
>> a per-thread linear consequential generator, initialized to a random
>> value at thread start?

> As each running threads has its own stack, thread' stack address can
> be used as a seed for such PRNG.

We would broadcast the stack address though, which is generally frowned
upon.

Thanks,
Florian
Cristian Rodríguez April 22, 2022, 1:35 p.m. UTC | #10
On Fri, Apr 22, 2022 at 9:32 AM Florian Weimer via Libc-alpha
<libc-alpha@sourceware.org> wrote:

> > As each running threads has its own stack, thread' stack address can
> > be used as a seed for such PRNG.
>
> We would broadcast the stack address though, which is generally fround
> upon.
i>

You can use the siphash approach suggested by Jason if there is a
concern of leaking that information.
Noah Goldstein April 22, 2022, 3:25 p.m. UTC | #11
On Fri, Apr 22, 2022 at 8:35 AM Cristian Rodríguez via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Apr 22, 2022 at 9:32 AM Florian Weimer via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>
> > > As each running threads has its own stack, thread' stack address can
> > > be used as a seed for such PRNG.
> >
> > We would broadcast the stack address though, which is generally fround
> > upon.

Why is that?

Also next version is likely to be based on:
https://patchwork.sourceware.org/project/glibc/patch/20220404213942.652799-1-goldstein.w.n@gmail.com/

But just grabbing the stack sounds faster.

> i>
>
> You can use the siphash approach suggested by Jason if there is a
> concern of leaking that information.
Florian Weimer April 26, 2022, 12:25 p.m. UTC | #12
* Noah Goldstein:

> On Fri, Apr 22, 2022 at 8:35 AM Cristian Rodríguez via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> On Fri, Apr 22, 2022 at 9:32 AM Florian Weimer via Libc-alpha
>> <libc-alpha@sourceware.org> wrote:
>>
>> > > As each running threads has its own stack, thread' stack address can
>> > > be used as a seed for such PRNG.
>> >
>> > We would broadcast the stack address though, which is generally fround
>> > upon.
>
> Why is that?

Potential bypass of ASLR hardening.

Thanks,
Florian
Yann Droneaud April 26, 2022, 12:42 p.m. UTC | #13
Hi,

Le 26/04/2022 à 14:25, Florian Weimer a écrit :
> * Noah Goldstein:
>
>> On Fri, Apr 22, 2022 at 8:35 AM Cristian Rodríguez via Libc-alpha
>> <libc-alpha@sourceware.org> wrote:
>>> On Fri, Apr 22, 2022 at 9:32 AM Florian Weimer via Libc-alpha
>>> <libc-alpha@sourceware.org> wrote:
>>>
>>>>> As each running threads has its own stack, thread' stack address can
>>>>> be used as a seed for such PRNG.
>>>> We would broadcast the stack address though, which is generally fround
>>>> upon.
>> Why is that?
> Potential bypass of ASLR hardening.

The attack would be to monitor the behavior of multiple threads in a
process contending for a lock, and get precise timings to recover the
few bits of each thread's stack address that leaked as part of the backoff
mechanism.

It may sound possible, but I find it unlikely to be possible without a full
compromise of the running process ...

That said, if the stack address is used as the key in SipHash, it is
cryptographically infeasible to recover it from the output, so no
exploitable leakage would happen.

Regards.
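
A hedged sketch of that approach follows; siphash24 () and secret_key are
assumptions of the sketch, not existing glibc symbols:

```c
#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch: feed the thread's stack address through a keyed
   SipHash so the per-thread jitter seed is unpredictable without the
   process-wide secret key, and the raw address cannot be recovered from
   observed backoff timings.  */
extern uint64_t siphash24 (const void *data, size_t len,
                           const uint8_t key[16]);
extern uint8_t secret_key[16];   /* initialized once, e.g. from AT_RANDOM */

static inline uint32_t
jitter_seed_from_stack (void)
{
  uintptr_t sp = (uintptr_t) &sp;   /* address of a local on this stack */
  return (uint32_t) siphash24 (&sp, sizeof sp, secret_key);
}
```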

Patch

diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index d2e652d151..7e75ec1cba 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -26,6 +26,7 @@ 
 #include <futex-internal.h>
 #include <stap-probe.h>
 #include <shlib-compat.h>
+#include <random-bits.h>
 
 /* Some of the following definitions differ when pthread_mutex_cond_lock.c
    includes this file.  */
@@ -64,11 +65,6 @@  lll_mutex_lock_optimized (pthread_mutex_t *mutex)
 # define PTHREAD_MUTEX_VERSIONS 1
 #endif
 
-#ifndef LLL_MUTEX_READ_LOCK
-# define LLL_MUTEX_READ_LOCK(mutex) \
-  atomic_load_relaxed (&(mutex)->__data.__lock)
-#endif
-
 static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
      __attribute_noinline__;
 
@@ -138,17 +134,28 @@  PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
 	  int cnt = 0;
 	  int max_cnt = MIN (max_adaptive_count (),
 			     mutex->__data.__spins * 2 + 10);
+	  int spin_count, exp_backoff = 1;
+	  unsigned int jitter = random_bits ();
 	  do
 	    {
-	      if (cnt++ >= max_cnt)
+	      /* In each loop, spin count is exponential backoff plus
+	         random jitter, random range is [0, exp_backoff-1].  */
+	      spin_count = exp_backoff + (jitter & (exp_backoff - 1));
+	      cnt += spin_count;
+	      if (cnt >= max_cnt)
 		{
+		  /* If cnt exceeds max spin count, just go to wait
+		     queue.  */
 		  LLL_MUTEX_LOCK (mutex);
 		  break;
 		}
-	      atomic_spin_nop ();
+	      do
+		  atomic_spin_nop ();
+	      while (--spin_count > 0);
+	      /* Binary exponential backoff, prepare for next loop.  */
+	      exp_backoff <<= 1;
 	    }
-	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
-		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
+	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
 
 	  mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
 	}