Message ID | 6f2efdb3-c45d-18c7-0b0e-89e91ab32eb4@netcologne.de |
---|---|
State | New |
Headers | show |
Series | [fortran,RFC] Interchange indices for FORALL and DO CONCURRENT if profitable | expand |
Hi Thomas, In general, I like the idea. I have some minor suggestions below. On Sat, Oct 28, 2017 at 12:03:58AM +0200, Thomas Koenig wrote: > +/* Callback function to determine if an expression is the > + corresponding variable. */ > + > +static int static bool > +has_var (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, void *data) > +{ > + gfc_expr *expr = *e; > + gfc_symbol *sym; > + > + if (expr->expr_type != EXPR_VARIABLE) > + return 0; return false; > + > + sym = (gfc_symbol *) data; > + return sym == expr->symtree->n.sym; > +} > + > +/* Callback function to calculate the cost of a certain index. */ This function always returns 0, so > +static int static void > +index_cost (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, > + void *data) > +{ > + ind_type *ind; > + gfc_expr *expr; > + gfc_array_ref *ar; > + gfc_ref *ref; > + int i,j; > + > + expr = *e; > + if (expr->expr_type != EXPR_VARIABLE) > + return 0; return; > + > + ar = NULL; > + for (ref = expr->ref; ref; ref = ref->next) > + { > + if (ref->type == REF_ARRAY) > + { > + ar = &ref->u.ar; > + break; > + } > + } > + if (ar == NULL || ar->type != AR_ELEMENT) > + return 0; return; > + > + ind = (ind_type *) data; > + for (i = 0; i < ar->dimen; i++) > + { > + for (j=0; ind[j].sym != NULL; j++) > + { > + if (gfc_expr_walker (&ar->start[i], has_var, (void *) (ind[j].sym))) > + ind[j].n[i]++; > + } > + } > + return 0; Delete this return as a void function that reaches its end will return; > +} > + > +/* Callback function for qsort, to sort the loop indices. */ > + > +static int > +loop_comp (const void *e1, const void *e2) > +{ > + const ind_type *i1 = (const ind_type *) e1; > + const ind_type *i2 = (const ind_type *) e2; > + int i; > + > + for (i=GFC_MAX_DIMENSIONS-1; i >= 0; i--) > + { > + if (i1->n[i] != i2->n[i]) > + return i1->n[i] - i2->n[i]; > + } > + /* All other things being equal, let's not change the ordering. */ > + return i2->num - i1->num; > +} > + > +/* Main function to do the index interchange. */ > + This function always returns 0, so > +static int static void > +index_interchange (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED, > + void *data ATTRIBUTE_UNUSED) > +{ > + gfc_code *co; > + co = *c; > + int n_iter; > + gfc_forall_iterator *fa; > + ind_type *ind; > + int i, j; > + > + if (co->op != EXEC_FORALL && co->op != EXEC_DO_CONCURRENT) > + return 0; return; > + > + n_iter = 0; > + for (fa = co->ext.forall_iterator; fa; fa = fa->next) > + n_iter ++; > + > + /* Nothing to reorder. */ > + if (n_iter < 2) > + return 0; return; > + > + ind = XALLOCAVEC (ind_type, n_iter + 1); > + > + i = 0; > + for (fa = co->ext.forall_iterator; fa; fa = fa->next) > + { > + ind[i].sym = fa->var->symtree->n.sym; > + ind[i].fa = fa; > + for (j=0; j<GFC_MAX_DIMENSIONS; j++) > + ind[i].n[j] = 0; > + ind[i].num = i; > + i++; > + } > + ind[n_iter].sym = NULL; > + ind[n_iter].fa = NULL; > + > + gfc_code_walker (c, gfc_dummy_code_callback, index_cost, (void *) ind); > + qsort ((void *) ind, n_iter, sizeof (ind_type), loop_comp); > + > + /* Do the actual index interchange. */ > + co->ext.forall_iterator = fa = ind[0].fa; > + for (i=1; i<n_iter; i++) > + { > + fa->next = ind[i].fa; > + fa = fa->next; > + } > + fa->next = NULL; > + > + return 0; Delete this return.
Hi Steve, > On Sat, Oct 28, 2017 at 12:03:58AM +0200, Thomas Koenig wrote: >> +/* Callback function to determine if an expression is the >> + corresponding variable. */ >> + >> +static int > static bool Most of the functions in the patch are callback functions for gfc_code_walker or gfc_expr_walker, respectively. Their function arguments are given as typedef int (*walk_code_fn_t) (gfc_code **, int *, void *); typedef int (*walk_expr_fn_t) (gfc_expr **, int *, void *); respectively, so the types of the functions are fixed. Regards Thomas
On October 28, 2017 12:03:58 AM GMT+02:00, Thomas Koenig <tkoenig@netcologne.de> wrote: >Hello world, > >this is a draft patch which interchanges the indices for FORALL and >DO CONCURRENT loops for cases like PR 82471, where code like > > DO CONCURRENT( K=1:N, J=1:M, I=1:L) > C(I,J,K) = A(I,J,K) + B(I,J,K) > END DO > >led to very poor code because of stride issues. Currently, >Graphite is not able to do this. > >Without the patch, the code above is translated as > > > i.7 = 1; > count.10 = 512; > while (1) > { > if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4; > j.6 = 1; > count.9 = 512; > while (1) > { > if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3; > k.5 = 1; > count.8 = 512; > while (1) > { > if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2; > (*(real(kind=4)[0:] * restrict) c.data)[((c.offset >+ (integer(kind=8)) k.5 * c.dim[2].stride) + (integer(kind=8)) j.6 * >c.dim[1].stride) + (integer(kind=8)) i.7] = (*(real(kind=4)[0:] * >restrict) a.data)[((a.offset + (integer(kind=8)) k.5 * a.dim[2].stride) > >+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.7] + >(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) >k.5 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + >(integer(kind=8)) i.7]; > L.1:; > k.5 = k.5 + 1; > count.8 = count.8 + -1; > } > L.2:; > j.6 = j.6 + 1; > count.9 = count.9 + -1; > } > L.3:; > i.7 = i.7 + 1; > count.10 = count.10 + -1; > } > L.4:; > >so the innermost loop has the biggest stride. With the patch >(and front-end optimization turned on), this is turned into > > k.7 = 1; > count.10 = 512; > while (1) > { > if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4; > j.6 = 1; > count.9 = 512; > while (1) > { > if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3; > i.5 = 1; > count.8 = 512; > while (1) > { > if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2; > (*(real(kind=4)[0:] * restrict) c.data)[((c.offset >+ (integer(kind=8)) k.7 * c.dim[2].stride) + (integer(kind=8)) j.6 * >c.dim[1].stride) + (integer(kind=8)) i.5] = (*(real(kind=4)[0:] * >restrict) a.data)[((a.offset + (integer(kind=8)) k.7 * a.dim[2].stride) > >+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.5] + >(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) >k.7 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + >(integer(kind=8)) i.5]; > L.1:; > i.5 = i.5 + 1; > count.8 = count.8 + -1; > } > L.2:; > j.6 = j.6 + 1; > count.9 = count.9 + -1; > } > L.3:; > k.7 = k.7 + 1; > count.10 = count.10 + -1; > } > L.4:; > >so the innermost loop is the one that gets traversed with unity stride >(the way it should have been done). > >Although the algorithm here is quite simple, it resolves the issues >raised in the PR so far, and it definitely worth fixing. > >If we do this kind of thing, it might also be possible to >interchange normal DO loops in a similar way (which Graphite also >cannot do at the moment, at least not if the bounds are variables). > >So, comments? Suggestions? Other ideas? Any ideas how to write >a test case? Any volunteers to re-implement Graphite in the >Fortran front end (didn't think so) or to make Graphite catch >this sort of pattern (which it currently doesn't) instead? Graphite has major issues with the scalarization of multidimensional arrays to a single array dimension. Indeed doing graphite optimization on the Fortran FE IL would be an interesting experiment. Maybe list that as summer of code project idea. Richard. >Regards > > Thomas > >2017-10-27 Thomas Koenig <tkoenig@gcc.gnu.org> > > * frontend-passes.c (index_interchange): New funciton, > prototype. > (optimize_namespace): Call index_interchange. > (ind_type): New function. > (has_var): New function. > (index_cost): New function. > (loop_comp): New function.
Am 28.10.2017 um 00:03 schrieb Thomas Koenig:
> +typepedef struct {
That should have been typdef, obviously - the typo
must have slipped in after testing.
Regards
Thomas
On Sat, Oct 28, 2017 at 01:23:30PM +0200, Thomas Koenig wrote: > Hi Steve, > > > On Sat, Oct 28, 2017 at 12:03:58AM +0200, Thomas Koenig wrote: > >> +/* Callback function to determine if an expression is the > >> + corresponding variable. */ > >> + > >> +static int > > static bool > > Most of the functions in the patch are callback functions for > gfc_code_walker or gfc_expr_walker, respectively. Their > function arguments are given as > > typedef int (*walk_code_fn_t) (gfc_code **, int *, void *); > typedef int (*walk_expr_fn_t) (gfc_expr **, int *, void *); > > respectively, so the types of the functions are fixed. > Whoops, I didn't realize that the prototypes were related to being call back functions. I noticed the functions were declared as int, but only returned a single value of 0. In any event, if you have already applied the patch, it looks ok to me.
Index: frontend-passes.c =================================================================== --- frontend-passes.c (Revision 253872) +++ frontend-passes.c (Arbeitskopie) @@ -55,6 +55,7 @@ static gfc_expr* check_conjg_transpose_variable (g bool *); static bool has_dimen_vector_ref (gfc_expr *); static int matmul_temp_args (gfc_code **, int *,void *data); +static int index_interchange (gfc_code **, int*, void *); #ifdef CHECKING_P static void check_locus (gfc_namespace *); @@ -1385,6 +1386,9 @@ optimize_namespace (gfc_namespace *ns) NULL); } + gfc_code_walker (&ns->code, index_interchange, dummy_expr_callback, + NULL); + /* BLOCKs are handled in the expression walker below. */ for (ns = ns->contained; ns; ns = ns->sibling) { @@ -4225,6 +4229,157 @@ inline_matmul_assign (gfc_code **c, int *walk_subt return 0; } + +/* Code for index interchange for loops which are grouped together in DO + CONCURRENT or FORALL statements. This is currently only applied if the + iterations are grouped together in a single statement. + + For this transformation, tt is assumed that memory access in strides is + expensive, and that loops which access later indices (which access memory + in bigger strides) shoud be moved to the first loops. + + For this, a loop over all the statements is executed, counting the times + that the loop iteration values are acessed in each index. The loop + indices are then sorted to minimize access to later indces from inner + loops. */ + +/* Type for holding index information. */ + +typepedef struct { + gfc_symbol *sym; + gfc_forall_iterator *fa; + int num; + int n[GFC_MAX_DIMENSIONS]; +} ind_type; + +/* Callback function to determine if an expression is the + corresponding variable. */ + +static int +has_var (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, void *data) +{ + gfc_expr *expr = *e; + gfc_symbol *sym; + + if (expr->expr_type != EXPR_VARIABLE) + return 0; + + sym = (gfc_symbol *) data; + return sym == expr->symtree->n.sym; +} + +/* Callback function to calculate the cost of a certain index. */ + +static int +index_cost (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, + void *data) +{ + ind_type *ind; + gfc_expr *expr; + gfc_array_ref *ar; + gfc_ref *ref; + int i,j; + + expr = *e; + if (expr->expr_type != EXPR_VARIABLE) + return 0; + + ar = NULL; + for (ref = expr->ref; ref; ref = ref->next) + { + if (ref->type == REF_ARRAY) + { + ar = &ref->u.ar; + break; + } + } + if (ar == NULL || ar->type != AR_ELEMENT) + return 0; + + ind = (ind_type *) data; + for (i = 0; i < ar->dimen; i++) + { + for (j=0; ind[j].sym != NULL; j++) + { + if (gfc_expr_walker (&ar->start[i], has_var, (void *) (ind[j].sym))) + ind[j].n[i]++; + } + } + return 0; +} + +/* Callback function for qsort, to sort the loop indices. */ + +static int +loop_comp (const void *e1, const void *e2) +{ + const ind_type *i1 = (const ind_type *) e1; + const ind_type *i2 = (const ind_type *) e2; + int i; + + for (i=GFC_MAX_DIMENSIONS-1; i >= 0; i--) + { + if (i1->n[i] != i2->n[i]) + return i1->n[i] - i2->n[i]; + } + /* All other things being equal, let's not change the ordering. */ + return i2->num - i1->num; +} + +/* Main function to do the index interchange. */ + +static int +index_interchange (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED, + void *data ATTRIBUTE_UNUSED) +{ + gfc_code *co; + co = *c; + int n_iter; + gfc_forall_iterator *fa; + ind_type *ind; + int i, j; + + if (co->op != EXEC_FORALL && co->op != EXEC_DO_CONCURRENT) + return 0; + + n_iter = 0; + for (fa = co->ext.forall_iterator; fa; fa = fa->next) + n_iter ++; + + /* Nothing to reorder. */ + if (n_iter < 2) + return 0; + + ind = XALLOCAVEC (ind_type, n_iter + 1); + + i = 0; + for (fa = co->ext.forall_iterator; fa; fa = fa->next) + { + ind[i].sym = fa->var->symtree->n.sym; + ind[i].fa = fa; + for (j=0; j<GFC_MAX_DIMENSIONS; j++) + ind[i].n[j] = 0; + ind[i].num = i; + i++; + } + ind[n_iter].sym = NULL; + ind[n_iter].fa = NULL; + + gfc_code_walker (c, gfc_dummy_code_callback, index_cost, (void *) ind); + qsort ((void *) ind, n_iter, sizeof (ind_type), loop_comp); + + /* Do the actual index interchange. */ + co->ext.forall_iterator = fa = ind[0].fa; + for (i=1; i<n_iter; i++) + { + fa->next = ind[i].fa; + fa = fa->next; + } + fa->next = NULL; + + return 0; +} + #define WALK_SUBEXPR(NODE) \ do \ { \