Message ID: 20111128140252.GK7827@bubble.grove.modra.org
State: New
On 11/28/2011 06:02 AM, Alan Modra wrote:
> -  unsigned int ret = bar->generation & ~3;
> -  /* Do we need any barrier here or is __sync_add_and_fetch acting
> -     as the needed LoadLoad barrier already?  */
> -  ret += __sync_add_and_fetch (&bar->awaited, -1) == 0;
> +  unsigned int ret = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
> +  ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0;

Given that the read from bar->generation is ACQ, we don't need a duplicate
barrier from the REL on the atomic add.  I believe both can be
MEMMODEL_ACQUIRE, both to force the ordering of these two memops and to
force them to happen before anything subsequent.

The s/_4/_n/ change needs doing.  Otherwise ok.

r~
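For reference, a minimal sketch of the variant Richard is suggesting, with
both memops at MEMMODEL_ACQUIRE (my illustration against the quoted
gomp_barrier_wait_start, also applying the requested s/_4/_n/ rename; it is
not code from the thread, and Alan's reply below shows the missing release
makes it insufficient in practice):

    static inline gomp_barrier_state_t
    gomp_barrier_wait_start (gomp_barrier_t *bar)
    {
      /* ACQUIRE on both: orders the generation load before the awaited
         decrement, and orders both before anything subsequent.  */
      unsigned int ret
        = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
      ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQUIRE) == 0;
      return ret;
    }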
On Mon, Nov 28, 2011 at 05:42:15PM -0800, Richard Henderson wrote:
> On 11/28/2011 06:02 AM, Alan Modra wrote:
> > -  unsigned int ret = bar->generation & ~3;
> > -  /* Do we need any barrier here or is __sync_add_and_fetch acting
> > -     as the needed LoadLoad barrier already?  */
> > -  ret += __sync_add_and_fetch (&bar->awaited, -1) == 0;
> > +  unsigned int ret = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
> > +  ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0;
>
> Given that the read from bar->generation is ACQ, we don't need a duplicate
> barrier from the REL on the atomic add.  I believe both can be
> MEMMODEL_ACQUIRE, both to force the ordering of these two memops and to
> force them to happen before anything subsequent.

I tried with MEMMODEL_ACQUIRE and ran force-parallel-6.exe, the test that
seems most sensitive to barrier problems, many times, and it occasionally
hangs in futex_wait called via gomp_team_barrier_wait_end.

I believe that threads can't be allowed to exit from
gomp_{,team_}barrier_wait without hitting a release barrier, and perhaps
from gomp_barrier_wait_last too.  gomp_barrier_wait_start is a convenient
point to insert the barrier, and a minimal change from the old code using
__sync_add_and_fetch.  I can add a comment.  ;-)
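The constraint Alan describes is the usual release/acquire pairing: stores a
thread makes before entering the barrier must become visible to every thread
that leaves it.  A hedged, standalone illustration (the writer/reader split
and the variable names are invented for this example; __ATOMIC_* are the
user-level spellings of libgomp's internal MEMMODEL_* values):

    int data, flag;

    void writer (void)
    {
      data = 42;  /* plain store */
      /* Release: orders the store to data before the store to flag.  */
      __atomic_store_n (&flag, 1, __ATOMIC_RELEASE);
    }

    void reader (void)
    {
      /* Acquire pairs with the release above.  */
      while (!__atomic_load_n (&flag, __ATOMIC_ACQUIRE))
        ;
      /* data is guaranteed to read 42 here.  */
    }

A thread leaving a barrier plays the reader role for every other thread's
writes and the writer role for its own, so each exiting thread needs a
release somewhere on its path; that is what the ACQ_REL add in
gomp_barrier_wait_start provides.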
On Tue, Nov 29, 2011 at 04:15:36PM +1030, Alan Modra wrote:
> On Mon, Nov 28, 2011 at 05:42:15PM -0800, Richard Henderson wrote:
> > On 11/28/2011 06:02 AM, Alan Modra wrote:
> > > -  unsigned int ret = bar->generation & ~3;
> > > -  /* Do we need any barrier here or is __sync_add_and_fetch acting
> > > -     as the needed LoadLoad barrier already?  */
> > > -  ret += __sync_add_and_fetch (&bar->awaited, -1) == 0;
> > > +  unsigned int ret = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
> > > +  ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0;
> >
> > Given that the read from bar->generation is ACQ, we don't need a duplicate
> > barrier from the REL on the atomic add.  I believe both can be
> > MEMMODEL_ACQUIRE, both to force the ordering of these two memops and to
> > force them to happen before anything subsequent.
>
> I tried with MEMMODEL_ACQUIRE and ran force-parallel-6.exe, the test that
> seems most sensitive to barrier problems, many times, and it occasionally
> hangs in futex_wait called via gomp_team_barrier_wait_end.
>
> I believe that threads can't be allowed to exit from
> gomp_{,team_}barrier_wait without hitting a release barrier, and perhaps
> from gomp_barrier_wait_last too.  gomp_barrier_wait_start is a convenient
> point to insert the barrier, and a minimal change from the old code using
> __sync_add_and_fetch.  I can add a comment.  ;-)

Committed rev 181833.
Index: libgomp/task.c
===================================================================
--- libgomp/task.c	(revision 181718)
+++ libgomp/task.c	(working copy)
@@ -273,6 +273,7 @@ gomp_barrier_handle_tasks (gomp_barrier_
 	      gomp_team_barrier_done (&team->barrier, state);
 	      gomp_mutex_unlock (&team->task_lock);
 	      gomp_team_barrier_wake (&team->barrier, 0);
+	      gomp_mutex_lock (&team->task_lock);
 	    }
 	}
     }
Index: libgomp/config/linux/bar.h
===================================================================
--- libgomp/config/linux/bar.h	(revision 181770)
+++ libgomp/config/linux/bar.h	(working copy)
@@ -50,7 +50,7 @@ static inline void gomp_barrier_init (go
 static inline void
 gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
 {
-  __sync_fetch_and_add (&bar->awaited, count - bar->total);
+  __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL);
   bar->total = count;
 }
 
@@ -69,10 +69,8 @@ extern void gomp_team_barrier_wake (gomp
 static inline gomp_barrier_state_t
 gomp_barrier_wait_start (gomp_barrier_t *bar)
 {
-  unsigned int ret = bar->generation & ~3;
-  /* Do we need any barrier here or is __sync_add_and_fetch acting
-     as the needed LoadLoad barrier already?  */
-  ret += __sync_add_and_fetch (&bar->awaited, -1) == 0;
+  unsigned int ret = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
+  ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0;
   return ret;
 }
 
Index: libgomp/config/linux/bar.c
===================================================================
--- libgomp/config/linux/bar.c	(revision 181770)
+++ libgomp/config/linux/bar.c	(working copy)
@@ -37,17 +37,15 @@ gomp_barrier_wait_end (gomp_barrier_t *b
     {
       /* Next time we'll be awaiting TOTAL threads again.  */
       bar->awaited = bar->total;
-      atomic_write_barrier ();
-      bar->generation += 4;
+      __atomic_store_4 (&bar->generation, bar->generation + 4,
+			MEMMODEL_RELEASE);
       futex_wake ((int *) &bar->generation, INT_MAX);
     }
   else
     {
-      unsigned int generation = state;
-
       do
-	do_wait ((int *) &bar->generation, generation);
-      while (bar->generation == generation);
+	do_wait ((int *) &bar->generation, state);
+      while (__atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) == state);
     }
 }
 
@@ -81,15 +79,15 @@ gomp_team_barrier_wake (gomp_barrier_t *
 void
 gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
-  unsigned int generation;
+  unsigned int generation, gen;
 
   if (__builtin_expect ((state & 1) != 0, 0))
     {
      /* Next time we'll be awaiting TOTAL threads again.  */
      struct gomp_thread *thr = gomp_thread ();
      struct gomp_team *team = thr->ts.team;
+
      bar->awaited = bar->total;
-     atomic_write_barrier ();
      if (__builtin_expect (team->task_count, 0))
	{
	  gomp_barrier_handle_tasks (state);
@@ -97,7 +95,7 @@ gomp_team_barrier_wait_end (gomp_barrier
	}
      else
	{
-	  bar->generation = state + 3;
+	  __atomic_store_4 (&bar->generation, state + 3, MEMMODEL_RELEASE);
	  futex_wake ((int *) &bar->generation, INT_MAX);
	  return;
	}
@@ -107,12 +105,16 @@ gomp_team_barrier_wait_end (gomp_barrier
   do
     {
       do_wait ((int *) &bar->generation, generation);
-      if (__builtin_expect (bar->generation & 1, 0))
-	gomp_barrier_handle_tasks (state);
-      if ((bar->generation & 2))
+      gen = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & 1, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  gen = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE);
+	}
+      if ((gen & 2) != 0)
	generation |= 2;
     }
-  while (bar->generation != state + 4);
+  while (gen != state + 4);
 }
 
 void
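One loose end from the review: the committed patch still uses the fixed-size
__atomic_load_4/__atomic_store_4 builtins, while Richard asked for the
size-generic _n forms.  As a sketch of that rename for one of the functions
above (my illustration of the requested change, not a quote of any later
commit):

    static inline gomp_barrier_state_t
    gomp_barrier_wait_start (gomp_barrier_t *bar)
    {
      /* The _n form takes the access size from the pointed-to type
         instead of hard-coding 4 bytes.  */
      unsigned int ret
        = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) & ~3;
      ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0;
      return ret;
    }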