Message ID | 1436271477-16941-1-git-send-email-sebastian.huber@embedded-brains.de |
---|---|
State | New |
Headers | show |
Ping. On 07/07/15 14:17, Sebastian Huber wrote: > Try to re-use the previous team to avoid the use of malloc() and free() > in the normal case where number of threads is the same. Avoid > superfluous destruction and initialization of team synchronization > objects. > > Using the microbenchmark posted here > > https://gcc.gnu.org/ml/gcc-patches/2008-03/msg00930.html > > shows an improvement in the parallel bench test case (target > x86_64-unknown-linux-gnu, median out of 9 test runs, iteration count > increased to 200000). > > Before the patch: > > parallel bench 11.2284 seconds > > After the patch: > > parallel bench 10.7575 seconds > > libgomp/ChangeLog > 2015-07-07 Sebastian Huber <sebastian.huber@embedded-brains.de> > > * libgomp.h (gomp_thread): Add spare_team field. > * team.c (gomp_thread_start): Initialize spare team for non-TLS > targets. > (gomp_new_team): Use spare team if possible. > (free_team): Destroy more team objects. > (gomp_free_thread): Free spare team if necessary. > (free_non_nested_team): New. > (gomp_team_end): Move some team object destructions to > free_team(). Use free_non_nested_team(). 
> --- > libgomp/libgomp.h | 3 +++ > libgomp/team.c | 63 ++++++++++++++++++++++++++++++++++++------------------- > 2 files changed, 45 insertions(+), 21 deletions(-) > > diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h > index 5ed0f78..563c1e2 100644 > --- a/libgomp/libgomp.h > +++ b/libgomp/libgomp.h > @@ -448,6 +448,9 @@ struct gomp_thread > > /* User pthread thread pool */ > struct gomp_thread_pool *thread_pool; > + > + /* Spare team ready for re-use in gomp_new_team() */ > + struct gomp_team *spare_team; > }; > > > diff --git a/libgomp/team.c b/libgomp/team.c > index b98b233..cc19eb0 100644 > --- a/libgomp/team.c > +++ b/libgomp/team.c > @@ -77,6 +77,7 @@ gomp_thread_start (void *xdata) > struct gomp_thread local_thr; > thr = &local_thr; > pthread_setspecific (gomp_tls_key, thr); > + thr->spare_team = NULL; > #endif > gomp_sem_init (&thr->release, 0); > > @@ -140,19 +141,35 @@ gomp_thread_start (void *xdata) > struct gomp_team * > gomp_new_team (unsigned nthreads) > { > + struct gomp_thread *thr = gomp_thread (); > + struct gomp_team *spare_team = thr->spare_team; > struct gomp_team *team; > - size_t size; > int i; > > - size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0]) > - + sizeof (team->implicit_task[0])); > - team = gomp_malloc (size); > + if (spare_team && spare_team->nthreads == nthreads) > + { > + thr->spare_team = NULL; > + team = spare_team; > + } > + else > + { > + size_t extra = sizeof (team->ordered_release[0]) > + + sizeof (team->implicit_task[0]); > + team = gomp_malloc (sizeof (*team) + nthreads * extra); > + > +#ifndef HAVE_SYNC_BUILTINS > + gomp_mutex_init (&team->work_share_list_free_lock); > +#endif > + gomp_barrier_init (&team->barrier, nthreads); > + gomp_sem_init (&team->master_release, 0); > + gomp_mutex_init (&team->task_lock); > + > + team->nthreads = nthreads; > + } > > team->work_share_chunk = 8; > #ifdef HAVE_SYNC_BUILTINS > team->single_count = 0; > -#else > - gomp_mutex_init (&team->work_share_list_free_lock); 
> #endif > team->work_shares_to_free = &team->work_shares[0]; > gomp_init_work_share (&team->work_shares[0], false, nthreads); > @@ -163,14 +180,9 @@ gomp_new_team (unsigned nthreads) > team->work_shares[i].next_free = &team->work_shares[i + 1]; > team->work_shares[i].next_free = NULL; > > - team->nthreads = nthreads; > - gomp_barrier_init (&team->barrier, nthreads); > - > - gomp_sem_init (&team->master_release, 0); > team->ordered_release = (void *) &team->implicit_task[nthreads]; > team->ordered_release[0] = &team->master_release; > > - gomp_mutex_init (&team->task_lock); > team->task_queue = NULL; > team->task_count = 0; > team->task_queued_count = 0; > @@ -187,6 +199,10 @@ gomp_new_team (unsigned nthreads) > static void > free_team (struct gomp_team *team) > { > + gomp_sem_destroy (&team->master_release); > +#ifndef HAVE_SYNC_BUILTINS > + gomp_mutex_destroy (&team->work_share_list_free_lock); > +#endif > gomp_barrier_destroy (&team->barrier); > gomp_mutex_destroy (&team->task_lock); > free (team); > @@ -225,6 +241,8 @@ gomp_free_thread (void *arg __attribute__((unused))) > { > struct gomp_thread *thr = gomp_thread (); > struct gomp_thread_pool *pool = thr->thread_pool; > + if (thr->spare_team) > + free_team (thr->spare_team); > if (pool) > { > if (pool->threads_used > 0) > @@ -835,6 +853,18 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, > free (affinity_thr); > } > > +static void > +free_non_nested_team (struct gomp_team *team, struct gomp_thread *thr) > +{ > + struct gomp_thread_pool *pool = thr->thread_pool; > + if (pool->last_team) > + { > + if (thr->spare_team) > + free_team (thr->spare_team); > + thr->spare_team = pool->last_team; > + } > + pool->last_team = team; > +} > > /* Terminate the current team. This is only to be called by the master > thread. We assume that we must wait for the other threads. 
*/ > @@ -894,21 +924,12 @@ gomp_team_end (void) > } > while (ws != NULL); > } > - gomp_sem_destroy (&team->master_release); > -#ifndef HAVE_SYNC_BUILTINS > - gomp_mutex_destroy (&team->work_share_list_free_lock); > -#endif > > if (__builtin_expect (thr->ts.team != NULL, 0) > || __builtin_expect (team->nthreads == 1, 0)) > free_team (team); > else > - { > - struct gomp_thread_pool *pool = thr->thread_pool; > - if (pool->last_team) > - free_team (pool->last_team); > - pool->last_team = team; > - } > + free_non_nested_team (team, thr); > } > >
On Mon, Jul 13, 2015 at 08:32:33AM +0200, Sebastian Huber wrote:
> Ping.
Space in gomp_thread is precious, that is a TLS variable, and you want
to handle only the non-nested case anyway, so why don't you
just try to use
if (thr->thread_pool)
{
struct gomp_team *last_team = thr->thread_pool->last_team;
if (last_team && last_team->nthreads == nthreads)
{
team = last_team;
thr->thread_pool->last_team = NULL;
}
}
?
The move of mutex, barrier and sem inits/destroy is possible (on Linux
likely not measurable due to destroys being nops and inits being very cheap,
but on other OSes it might be).
Jakub
On 13/07/15 09:17, Jakub Jelinek wrote: > On Mon, Jul 13, 2015 at 08:32:33AM +0200, Sebastian Huber wrote: >> Ping. > Space in gomp_thread is precious, that is a TLS variable, and you want > to only handle the non-nested case only anyway, so why don't you > just try to use > if (thr->thread_pool) > { > struct gomp_thread *last_team = thr->thread_pool->last_team; > if (last_team && last_team->nthreads == nthreads) > { > team = last_team; > thr->thread_pool->last_team = NULL; > } > } > ? I started with this variant, but it needs one more if to check the thread pool. Since space seems to be more important, I will adjust the patch.
On Mon, Jul 13, 2015 at 09:26:10AM +0200, Sebastian Huber wrote: > > > On 13/07/15 09:17, Jakub Jelinek wrote: > >On Mon, Jul 13, 2015 at 08:32:33AM +0200, Sebastian Huber wrote: > >>Ping. > >Space in gomp_thread is precious, that is a TLS variable, and you want > >to only handle the non-nested case only anyway, so why don't you > >just try to use > >if (thr->thread_pool) > > { > > struct gomp_thread *last_team = thr->thread_pool->last_team; > > if (last_team && last_team->nthreads == nthreads) > > { > > team = last_team; > > thr->thread_pool->last_team = NULL; > > } > > } > >? > > I started with this variant, but it needs one more if to check the thread > pool. Since space seems to be more important, I will adjust the patch. I guess you can use if (__builtin_expect (thr->thread_pool != NULL, 1)) the first GOMP_parallel etc. in a thread is slower anyway. It also needs a check that thr->ts.team == NULL. Jakub
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 5ed0f78..563c1e2 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -448,6 +448,9 @@ struct gomp_thread /* User pthread thread pool */ struct gomp_thread_pool *thread_pool; + + /* Spare team ready for re-use in gomp_new_team() */ + struct gomp_team *spare_team; }; diff --git a/libgomp/team.c b/libgomp/team.c index b98b233..cc19eb0 100644 --- a/libgomp/team.c +++ b/libgomp/team.c @@ -77,6 +77,7 @@ gomp_thread_start (void *xdata) struct gomp_thread local_thr; thr = &local_thr; pthread_setspecific (gomp_tls_key, thr); + thr->spare_team = NULL; #endif gomp_sem_init (&thr->release, 0); @@ -140,19 +141,35 @@ gomp_thread_start (void *xdata) struct gomp_team * gomp_new_team (unsigned nthreads) { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *spare_team = thr->spare_team; struct gomp_team *team; - size_t size; int i; - size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0]) - + sizeof (team->implicit_task[0])); - team = gomp_malloc (size); + if (spare_team && spare_team->nthreads == nthreads) + { + thr->spare_team = NULL; + team = spare_team; + } + else + { + size_t extra = sizeof (team->ordered_release[0]) + + sizeof (team->implicit_task[0]); + team = gomp_malloc (sizeof (*team) + nthreads * extra); + +#ifndef HAVE_SYNC_BUILTINS + gomp_mutex_init (&team->work_share_list_free_lock); +#endif + gomp_barrier_init (&team->barrier, nthreads); + gomp_sem_init (&team->master_release, 0); + gomp_mutex_init (&team->task_lock); + + team->nthreads = nthreads; + } team->work_share_chunk = 8; #ifdef HAVE_SYNC_BUILTINS team->single_count = 0; -#else - gomp_mutex_init (&team->work_share_list_free_lock); #endif team->work_shares_to_free = &team->work_shares[0]; gomp_init_work_share (&team->work_shares[0], false, nthreads); @@ -163,14 +180,9 @@ gomp_new_team (unsigned nthreads) team->work_shares[i].next_free = &team->work_shares[i + 1]; team->work_shares[i].next_free = NULL; - team->nthreads = 
nthreads; - gomp_barrier_init (&team->barrier, nthreads); - - gomp_sem_init (&team->master_release, 0); team->ordered_release = (void *) &team->implicit_task[nthreads]; team->ordered_release[0] = &team->master_release; - gomp_mutex_init (&team->task_lock); team->task_queue = NULL; team->task_count = 0; team->task_queued_count = 0; @@ -187,6 +199,10 @@ gomp_new_team (unsigned nthreads) static void free_team (struct gomp_team *team) { + gomp_sem_destroy (&team->master_release); +#ifndef HAVE_SYNC_BUILTINS + gomp_mutex_destroy (&team->work_share_list_free_lock); +#endif gomp_barrier_destroy (&team->barrier); gomp_mutex_destroy (&team->task_lock); free (team); @@ -225,6 +241,8 @@ gomp_free_thread (void *arg __attribute__((unused))) { struct gomp_thread *thr = gomp_thread (); struct gomp_thread_pool *pool = thr->thread_pool; + if (thr->spare_team) + free_team (thr->spare_team); if (pool) { if (pool->threads_used > 0) @@ -835,6 +853,18 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, free (affinity_thr); } +static void +free_non_nested_team (struct gomp_team *team, struct gomp_thread *thr) +{ + struct gomp_thread_pool *pool = thr->thread_pool; + if (pool->last_team) + { + if (thr->spare_team) + free_team (thr->spare_team); + thr->spare_team = pool->last_team; + } + pool->last_team = team; +} /* Terminate the current team. This is only to be called by the master thread. We assume that we must wait for the other threads. */ @@ -894,21 +924,12 @@ gomp_team_end (void) } while (ws != NULL); } - gomp_sem_destroy (&team->master_release); -#ifndef HAVE_SYNC_BUILTINS - gomp_mutex_destroy (&team->work_share_list_free_lock); -#endif if (__builtin_expect (thr->ts.team != NULL, 0) || __builtin_expect (team->nthreads == 1, 0)) free_team (team); else - { - struct gomp_thread_pool *pool = thr->thread_pool; - if (pool->last_team) - free_team (pool->last_team); - pool->last_team = team; - } + free_non_nested_team (team, thr); }