From patchwork Fri Oct 27 22:03:58 2017
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Thomas Koenig <tkoenig@netcologne.de>
X-Patchwork-Id: 831522
Return-Path: 
 <gcc-patches-return-465398-incoming=patchwork.ozlabs.org@gcc.gnu.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Authentication-Results: ozlabs.org;
	spf=pass (mailfrom) smtp.mailfrom=gcc.gnu.org
	(client-ip=209.132.180.131; helo=sourceware.org;
	envelope-from=gcc-patches-return-465398-incoming=patchwork.ozlabs.org@gcc.gnu.org;
	receiver=<UNKNOWN>)
Authentication-Results: ozlabs.org; dkim=pass (1024-bit key;
	unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org
	header.b="Z0lGh/sQ"; dkim-atps=neutral
Received: from sourceware.org (server1.sourceware.org [209.132.180.131])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256
	bits)) (No client certificate requested)
	by ozlabs.org (Postfix) with ESMTPS id 3yNyZy6pvXz9sNd
	for <incoming@patchwork.ozlabs.org>;
	Sat, 28 Oct 2017 09:04:25 +1100 (AEDT)
DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:to
	:from:subject:message-id:date:mime-version:content-type; q=dns;
	s=default; b=CJzKd8so4e5514YYX3bWUhB0QCQcFChtw634UW07Fi9sq+tQmu
	ips/veNtyR/Y4w9e0tao83VDfptXgVqko48d85mzeEjRMUDUFBMleCRUGCnyRBrH
	cqs+AFhIbcTiAPfeVfel4YjjuUdUFgMgsiR10J0YGYzS+3aC5hmyja4aA=
DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:to
	:from:subject:message-id:date:mime-version:content-type; s=
	default; bh=hLxkkG9GsvgY/a+471oTdOocL2o=; b=Z0lGh/sQje5VYJmaq8mq
	/UKr3lIjcjghG7Nvkk/7tvCaMryIYHRuS6c8TiZaMdTGWqqk8B3WBTdAMAb5AAuI
	POwwo5jl9RWRpLFMFX0Dcyw39TarzmMZeOZmGU5x3YIQANaambxpDRuPsxwNiRtX
	c0Y/Y6ardHQe+T0ufZ0lxxo=
Received: (qmail 80662 invoked by alias); 27 Oct 2017 22:04:09 -0000
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-patches.gcc.gnu.org>
List-Unsubscribe: 
 <mailto:gcc-patches-unsubscribe-incoming=patchwork.ozlabs.org@gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-help@gcc.gnu.org>
Sender: gcc-patches-owner@gcc.gnu.org
Delivered-To: mailing list gcc-patches@gcc.gnu.org
Received: (qmail 80637 invoked by uid 89); 27 Oct 2017 22:04:07 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-16.3 required=5.0 tests=AWL, BAYES_00,
	GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, RCVD_IN_DNSWL_LOW,
	RP_MATCHES_RCVD,
	SPF_PASS autolearn=ham version=3.3.2 spammy=UD:ar, unity,
	H*r:sk:tkoenig, Hx-spam-relays-external:sk:!192.16
X-Spam-User: qpsmtpd, 2 recipients
X-HELO: cc-smtpout3.netcologne.de
Received: from cc-smtpout3.netcologne.de (HELO cc-smtpout3.netcologne.de)
	(89.1.8.213) by sourceware.org
	(qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP;
	Fri, 27 Oct 2017 22:04:05 +0000
Received: from cc-smtpin2.netcologne.de (cc-smtpin2.netcologne.de
	[89.1.8.202])	by cc-smtpout3.netcologne.de (Postfix) with
	ESMTP id 30B8E12876; Sat, 28 Oct 2017 00:04:01 +0200 (CEST)
Received: from localhost (localhost [127.0.0.1])	by cc-smtpin2.netcologne.de
	(Postfix) with ESMTP id 234CC11E07;
	Sat, 28 Oct 2017 00:04:01 +0200 (CEST)
Received: from [78.35.155.138] (helo=cc-smtpin2.netcologne.de)	by localhost
	with ESMTP (eXpurgate 4.1.9)	(envelope-from
	<tkoenig@netcologne.de>)	id
	59f3ad51-02b8-7f0000012729-7f000001e5f7-1	for
	<multiple-recipients>; Sat, 28 Oct 2017 00:04:01 +0200
Received: from [192.168.178.20] (xdsl-78-35-155-138.netcologne.de
	[78.35.155.138])	(using TLSv1.2 with cipher
	ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))	(No client
	certificate requested)	by cc-smtpin2.netcologne.de (Postfix)
	with ESMTPSA; Sat, 28 Oct 2017 00:03:59 +0200 (CEST)
To: "fortran@gcc.gnu.org" <fortran@gcc.gnu.org>,
	gcc-patches <gcc-patches@gcc.gnu.org>
From: Thomas Koenig <tkoenig@netcologne.de>
Subject: [patch, fortran,
	RFC] Interchange indices for FORALL and DO CONCURRENT if profitable
Message-ID: <6f2efdb3-c45d-18c7-0b0e-89e91ab32eb4@netcologne.de>
Date: Sat, 28 Oct 2017 00:03:58 +0200
User-Agent: Mozilla/5.0 (X11; Linux x86_64;
	rv:52.0) Gecko/20100101 Thunderbird/52.4.0
MIME-Version: 1.0

Hello world,

this is a draft patch which interchanges the indices for FORALL and
DO CONCURRENT loops for cases like PR 82471, where code like

   DO CONCURRENT( K=1:N, J=1:M, I=1:L)
      C(I,J,K) = A(I,J,K) + B(I,J,K)
   END DO

led to very poor code because of stride issues.  Currently,
Graphite is not able to do this.

Without the patch, the code above is translated as


         i.7 = 1;
         count.10 = 512;
         while (1)
           {
             if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
             j.6 = 1;
             count.9 = 512;
             while (1)
               {
                 if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
                 k.5 = 1;
                 count.8 = 512;
                 while (1)
                   {
                     if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
                     (*(real(kind=4)[0:] * restrict) c.data)[((c.offset 
+ (integer(kind=8)) k.5 * c.dim[2].stride) + (integer(kind=8)) j.6 * 
c.dim[1].stride) + (integer(kind=8)) i.7] = (*(real(kind=4)[0:] * 
restrict) a.data)[((a.offset + (integer(kind=8)) k.5 * a.dim[2].stride) 
+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.7] + 
(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) 
k.5 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + 
(integer(kind=8)) i.7];
                     L.1:;
                     k.5 = k.5 + 1;
                     count.8 = count.8 + -1;
                   }
                 L.2:;
                 j.6 = j.6 + 1;
                 count.9 = count.9 + -1;
               }
             L.3:;
             i.7 = i.7 + 1;
             count.10 = count.10 + -1;
           }
         L.4:;

so the innermost loop has the biggest stride. With the patch
(and front-end optimization turned on), this is turned into

         k.7 = 1;
         count.10 = 512;
         while (1)
           {
             if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
             j.6 = 1;
             count.9 = 512;
             while (1)
               {
                 if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
                 i.5 = 1;
                 count.8 = 512;
                 while (1)
                   {
                     if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
                     (*(real(kind=4)[0:] * restrict) c.data)[((c.offset 
+ (integer(kind=8)) k.7 * c.dim[2].stride) + (integer(kind=8)) j.6 * 
c.dim[1].stride) + (integer(kind=8)) i.5] = (*(real(kind=4)[0:] * 
restrict) a.data)[((a.offset + (integer(kind=8)) k.7 * a.dim[2].stride) 
+ (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.5] + 
(*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) 
k.7 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + 
(integer(kind=8)) i.5];
                     L.1:;
                     i.5 = i.5 + 1;
                     count.8 = count.8 + -1;
                   }
                 L.2:;
                 j.6 = j.6 + 1;
                 count.9 = count.9 + -1;
               }
             L.3:;
             k.7 = k.7 + 1;
             count.10 = count.10 + -1;
           }
         L.4:;

so the innermost loop is the one that gets traversed with unity stride
(the way it should have been done).

Although the algorithm here is quite simple, it resolves the issues
raised in the PR so far, and it definitely worth fixing.

If we do this kind of thing, it might also be possible to
interchange normal DO loops in a similar way (which Graphite also
cannot do at the moment, at least not if the bounds are variables).

So, comments? Suggestions? Other ideas? Any ideas how to write
a test case? Any volunteers to re-implement Graphite in the
Fortran front end (didn't think so) or to make Graphite catch
this sort of pattern (which it currently doesn't) instead?

Regards

	Thomas

2017-10-27  Thomas Koenig  <tkoenig@gcc.gnu.org>

         * frontend-passes.c (index_interchange): New funciton,
         prototype.
         (optimize_namespace): Call index_interchange.
         (ind_type): New function.
         (has_var): New function.
         (index_cost): New function.
         (loop_comp): New function.

Index: frontend-passes.c
===================================================================
--- frontend-passes.c	(Revision 253872)
+++ frontend-passes.c	(Arbeitskopie)
@@ -55,6 +55,7 @@ static gfc_expr* check_conjg_transpose_variable (g
 						 bool *);
 static bool has_dimen_vector_ref (gfc_expr *);
 static int matmul_temp_args (gfc_code **, int *,void *data);
+static int index_interchange (gfc_code **, int*, void *);
 
 #ifdef CHECKING_P
 static void check_locus (gfc_namespace *);
@@ -1385,6 +1386,9 @@ optimize_namespace (gfc_namespace *ns)
 		       NULL);
     }
 
+  gfc_code_walker (&ns->code, index_interchange, dummy_expr_callback,
+		   NULL);
+
   /* BLOCKs are handled in the expression walker below.  */
   for (ns = ns->contained; ns; ns = ns->sibling)
     {
@@ -4225,6 +4229,157 @@ inline_matmul_assign (gfc_code **c, int *walk_subt
   return 0;
 }
 
+
+/* Code for index interchange for loops which are grouped together in DO
+   CONCURRENT or FORALL statements.  This is currently only applied if the
+   iterations are grouped together in a single statement.
+
+   For this transformation, tt is assumed that memory access in strides is
+   expensive, and that loops which access later indices (which access memory
+   in bigger strides) shoud be moved to the first loops.
+
+   For this, a loop over all the statements is executed, counting the times
+   that the loop iteration values are acessed in each index.  The loop
+   indices are then sorted to minimize access to later indces from inner
+   loops.  */
+
+/* Type for holding index information.  */
+
+typepedef struct {
+  gfc_symbol *sym;
+  gfc_forall_iterator *fa;
+  int num;
+  int n[GFC_MAX_DIMENSIONS];
+} ind_type;
+
+/* Callback function to determine if an expression is the 
+   corresponding variable.  */
+
+static int
+has_var (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, void *data)
+{
+  gfc_expr *expr = *e;
+  gfc_symbol *sym;
+
+  if (expr->expr_type != EXPR_VARIABLE)
+    return 0;
+
+  sym = (gfc_symbol *) data;
+  return sym == expr->symtree->n.sym;
+}
+
+/* Callback function to calculate the cost of a certain index.  */
+
+static int
+index_cost (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED,
+	    void *data)
+{
+  ind_type *ind;
+  gfc_expr *expr;
+  gfc_array_ref *ar;
+  gfc_ref *ref;
+  int i,j;
+
+  expr = *e;
+  if (expr->expr_type != EXPR_VARIABLE)
+    return 0;
+
+  ar = NULL;
+  for (ref = expr->ref; ref; ref = ref->next)
+    {
+      if (ref->type == REF_ARRAY)
+	{
+	  ar = &ref->u.ar;
+	  break;
+	}
+    }
+  if (ar == NULL || ar->type != AR_ELEMENT)
+    return 0;
+
+  ind = (ind_type *) data;
+  for (i = 0; i < ar->dimen; i++)
+    {
+      for (j=0; ind[j].sym != NULL; j++)
+	{
+	  if (gfc_expr_walker (&ar->start[i], has_var, (void *) (ind[j].sym)))
+	      ind[j].n[i]++;
+	}
+    }
+  return 0;
+}
+
+/* Callback function for qsort, to sort the loop indices. */
+
+static int
+loop_comp (const void *e1, const void *e2)
+{
+  const ind_type *i1 = (const ind_type *) e1;
+  const ind_type *i2 = (const ind_type *) e2;
+  int i;
+
+  for (i=GFC_MAX_DIMENSIONS-1; i >= 0; i--)
+    {
+      if (i1->n[i] != i2->n[i])
+	return i1->n[i] - i2->n[i];
+    }
+  /* All other things being equal, let's not change the ordering.  */
+  return i2->num - i1->num;
+}
+
+/* Main function to do the index interchange.  */
+
+static int
+index_interchange (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
+		  void *data ATTRIBUTE_UNUSED)
+{
+  gfc_code *co;
+  co = *c;
+  int n_iter;
+  gfc_forall_iterator *fa;
+  ind_type *ind;
+  int i, j;
+  
+  if (co->op != EXEC_FORALL && co->op != EXEC_DO_CONCURRENT)
+    return 0;
+
+  n_iter = 0;
+  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
+    n_iter ++;
+
+  /* Nothing to reorder. */
+  if (n_iter < 2)
+    return 0;
+
+  ind = XALLOCAVEC (ind_type, n_iter + 1);
+
+  i = 0;
+  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
+    {
+      ind[i].sym = fa->var->symtree->n.sym;
+      ind[i].fa = fa;
+      for (j=0; j<GFC_MAX_DIMENSIONS; j++)
+	ind[i].n[j] = 0;
+      ind[i].num = i;
+      i++;
+    }
+  ind[n_iter].sym = NULL;
+  ind[n_iter].fa = NULL;
+
+  gfc_code_walker (c, gfc_dummy_code_callback, index_cost, (void *) ind);
+  qsort ((void *) ind, n_iter, sizeof (ind_type), loop_comp);
+
+  /* Do the actual index interchange.  */
+  co->ext.forall_iterator = fa = ind[0].fa;
+  for (i=1; i<n_iter; i++)
+    {
+      fa->next = ind[i].fa;
+      fa = fa->next;
+    }
+  fa->next = NULL;
+
+  return 0;
+}
+
 #define WALK_SUBEXPR(NODE) \
   do							\
     {							\