From patchwork Fri Apr 20 14:01:56 2018
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Tom de Vries <Tom_deVries@mentor.com>
X-Patchwork-Id: 901942
Return-Path: 
 <gcc-patches-return-476676-incoming=patchwork.ozlabs.org@gcc.gnu.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Authentication-Results: ozlabs.org;
	spf=pass (mailfrom) smtp.mailfrom=gcc.gnu.org
	(client-ip=209.132.180.131; helo=sourceware.org;
	envelope-from=gcc-patches-return-476676-incoming=patchwork.ozlabs.org@gcc.gnu.org;
	receiver=<UNKNOWN>)
Authentication-Results: ozlabs.org;
	dmarc=none (p=none dis=none) header.from=mentor.com
Authentication-Results: ozlabs.org; dkim=pass (1024-bit key;
	unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org
	header.b="j66c5bmx"; dkim-atps=neutral
Received: from sourceware.org (server1.sourceware.org [209.132.180.131])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256
	bits)) (No client certificate requested)
	by ozlabs.org (Postfix) with ESMTPS id 40SHbv43cCz9s5f
	for <incoming@patchwork.ozlabs.org>;
	Sat, 21 Apr 2018 00:02:18 +1000 (AEST)
DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:to
	:from:subject:message-id:date:mime-version:content-type; q=dns;
	s=default; b=eEPdJmRhOSRSf6Yrx2casqGQFEImMQBKT3DhCLl6kxq+YV9dZw
	bQ/NqHTrTQZOujPC5jf/7rsCP/L1MS68P52Qvmg2lSp2YCBf6UUKi8TyAVg8VviY
	gdxb0nf63nKDNz+ZFNN3SBBZ/flPq6NgYmHbkDwkMmciFUYQQt67sBJJw=
DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:to
	:from:subject:message-id:date:mime-version:content-type; s=
	default; bh=zhqhrZ/374t+RKhhJt0pjfmr0IU=; b=j66c5bmx9TyUwMhSpH3k
	sscT/VO9yMUpbXJh8SYkQ0pz0Rdcxaw6rq1qq25UpHp7UDSewtN659TGO3RzTtr8
	4jYHhUjzaogck4/ZHCkHVvdyAWMUVf5zRbKE1q9D5Gm7BP2SJClIwH9JDuDtIpKo
	+a2iW0roDjs+8Qp8fujGJjQ=
Received: (qmail 69486 invoked by alias); 20 Apr 2018 14:02:10 -0000
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-patches.gcc.gnu.org>
List-Unsubscribe: 
 <mailto:gcc-patches-unsubscribe-incoming=patchwork.ozlabs.org@gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-help@gcc.gnu.org>
Sender: gcc-patches-owner@gcc.gnu.org
Delivered-To: mailing list gcc-patches@gcc.gnu.org
Received: (qmail 69470 invoked by uid 89); 20 Apr 2018 14:02:09 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-24.9 required=5.0 tests=AWL, BAYES_00,
	GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3,
	RCVD_IN_DNSWL_NONE, SPF_PASS,
	URIBL_RED autolearn=ham version=3.3.2 spammy=
X-HELO: relay1.mentorg.com
Received: from relay1.mentorg.com (HELO relay1.mentorg.com) (192.94.38.131)
	by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with
	ESMTP; Fri, 20 Apr 2018 14:02:03 +0000
Received: from nat-ies.mentorg.com ([192.94.31.2]
	helo=svr-ies-mbx-01.mgc.mentorg.com)	by relay1.mentorg.com
	with esmtps (TLSv1.2:ECDHE-RSA-AES256-SHA384:256)	id
	1f9Wbw-0004gx-UT from Tom_deVries@mentor.com for
	gcc-patches@gcc.gnu.org; Fri, 20 Apr 2018 07:02:01 -0700
Received: from [172.30.72.29] (137.202.0.87) by
	svr-ies-mbx-01.mgc.mentorg.com (139.181.222.1) with Microsoft
	SMTP Server (TLS) id 15.0.1320.4; Fri, 20 Apr 2018 15:01:57 +0100
To: GCC Patches <gcc-patches@gcc.gnu.org>
From: Tom de Vries <Tom_deVries@mentor.com>
Subject: [nvptx, PR85445, committed] Fix calls to vector and worker routines
Message-ID: <88637356-6d57-49b2-eb55-6aa0af3a35ad@mentor.com>
Date: Fri, 20 Apr 2018 16:01:56 +0200
User-Agent: Mozilla/5.0 (X11; Linux x86_64;
	rv:52.0) Gecko/20100101 Thunderbird/52.7.0
MIME-Version: 1.0
X-ClientProxiedBy: svr-ies-mbx-01.mgc.mentorg.com (139.181.222.1) To
	svr-ies-mbx-01.mgc.mentorg.com (139.181.222.1)

Hi,

Consider this test-case (minimized from the test-case in the patch):
...
#pragma acc routine vector
static void __attribute__((always_inline))
Vector (int *ptr, int n, const int &inc)
{
#pragma acc loop vector
   for (unsigned ix = 0; ix < n; ix++)
     ptr[ix] += inc;
}

#pragma acc routine worker
void __attribute__((noinline, noclone))
Worker (int *ptr, int m, int n, const int &inc)
{
#pragma acc loop worker
   for (unsigned ix = 0; ix < m; ix++)
     Vector(ptr + ix * n, n, inc);
}

int
main (void)
{
   const int n = 32, m = 32;

   int ary[m][n];
   unsigned ix,  iy;

#pragma acc parallel copy(ary)
   Worker (&ary[0][0], m, n, 1 << 16);

   return 0;
}
...


The inc parameter is a reference parameter, so the argument 1<<16 
(65536) is saved on stack:
...
                 mov.u32 %r25, 65536; 

                 st.u32  [%frame], %r25; 

...

and the address is passed as argument:
...
                 .param.u64 %out_arg4;
                 st.param.u64 [%out_arg4], %frame;
                 call _Z6WorkerPiiiRKi, (%out_arg1, %out_arg2, 
%out_arg3, %out_arg4);
...

The stack is declared with .local:
...
         .local .align 16 .b8 %frame_ar[16];
         .reg.u64 %frame;
         cvta.local.u64 %frame, %frame_ar;
...

which in ptx means:
...
Local memory, private to each thread.
...

The initialization of the stack is done in thread W0V0, but the stack is 
read in  WAVA mode, so it's reading uninitialized stack memory in all 
but the W0V0 thread.

The patch (r239736 in og7) fixes this by broadcasting the stack from 
W0V0 to WAVA before the call.

Build x86_64 with nvptx accelerator and reg-tested libgomp.

Committed to stage4 trunk.

Thanks,
- Tom

[nvptx] Fix calls to vector and worker routines

2019-04-20  Nathan Sidwell  <nathan@codesourcery.com>
	    Tom de Vries  <tom@codesourcery.com>

	PR target/85445
	* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
	Emit insns for calls too.
	(nvptx_find_par): Always look for worker-level predecessor insn.
	(nvptx_propagate): Add is_call parm, return bool.  Copy frame for
	calls.
	(nvptx_vpropagate, nvptx_wpropagate): Adjust.
	(nvptx_process_pars): Propagate frames for calls.

	* testsuite/libgomp.oacc-c++/ref-1.C: New.

---
 gcc/config/nvptx/nvptx.c                   | 106 ++++++++++++++++-------------
 libgomp/testsuite/libgomp.oacc-c++/ref-1.C |  78 +++++++++++++++++++++
 2 files changed, 138 insertions(+), 46 deletions(-)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 131b495..ca3fea3 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -399,8 +399,7 @@ nvptx_emit_forking (unsigned mask, bool is_call)
 	 it creates a block with a single successor before entering a
 	 partitooned region.  That is a good candidate for the end of
 	 an SESE region.  */
-      if (!is_call)
-	emit_insn (gen_nvptx_fork (op));
+      emit_insn (gen_nvptx_fork (op));
       emit_insn (gen_nvptx_forked (op));
     }
 }
@@ -419,8 +418,7 @@ nvptx_emit_joining (unsigned mask, bool is_call)
       /* Emit joining for all non-call pars to ensure there's a single
 	 predecessor for the block the join insn ends up in.  This is
 	 needed for skipping entire loops.  */
-      if (!is_call)
-	emit_insn (gen_nvptx_joining (op));
+      emit_insn (gen_nvptx_joining (op));
       emit_insn (gen_nvptx_join (op));
     }
 }
@@ -3086,8 +3084,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
 	    par = new parallel (par, mask);
 	    par->forked_block = block;
 	    par->forked_insn = end;
-	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	      par->fork_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
 	  }
@@ -3102,8 +3099,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
 	    gcc_assert (par->mask == mask);
 	    par->join_block = block;
 	    par->join_insn = end;
-	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	      par->joining_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
 	    par = par->parent;
@@ -3782,29 +3778,34 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
 #undef BB_SET_SESE
 #undef BB_GET_SESE
 
-/* Propagate live state at the start of a partitioned region.  BLOCK
-   provides the live register information, and might not contain
-   INSN. Propagation is inserted just after INSN. RW indicates whether
-   we are reading and/or writing state.  This
+/* Propagate live state at the start of a partitioned region.  IS_CALL
+   indicates whether the propagation is for a (partitioned) call
+   instruction.  BLOCK provides the live register information, and
+   might not contain INSN. Propagation is inserted just after INSN. RW
+   indicates whether we are reading and/or writing state.  This
    separation is needed for worker-level proppagation where we
    essentially do a spill & fill.  FN is the underlying worker
    function to generate the propagation instructions for single
    register.  DATA is user data.
 
-   We propagate the live register set and the entire frame.  We could
-   do better by (a) propagating just the live set that is used within
-   the partitioned regions and (b) only propagating stack entries that
-   are used.  The latter might be quite hard to determine.  */
+   Returns true if we didn't emit any instructions.
+
+   We propagate the live register set for non-calls and the entire
+   frame for calls and non-calls.  We could do better by (a)
+   propagating just the live set that is used within the partitioned
+   regions and (b) only propagating stack entries that are used.  The
+   latter might be quite hard to determine.  */
 
 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
 
-static void
-nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
-		 propagator_fn fn, void *data)
+static bool
+nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
+		 propagate_mask rw, propagator_fn fn, void *data)
 {
   bitmap live = DF_LIVE_IN (block);
   bitmap_iterator iterator;
   unsigned ix;
+  bool empty = true;
 
   /* Copy the frame array.  */
   HOST_WIDE_INT fs = get_frame_size ();
@@ -3816,6 +3817,7 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
       rtx pred = NULL_RTX;
       rtx_code_label *label = NULL;
 
+      empty = false;
       /* The frame size might not be DImode compatible, but the frame
 	 array's declaration will be.  So it's ok to round up here.  */
       fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
@@ -3862,18 +3864,21 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
       insn = emit_insn_after (cpy, insn);
     }
 
-  /* Copy live registers.  */
-  EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
-    {
-      rtx reg = regno_reg_rtx[ix];
+  if (!is_call)
+    /* Copy live registers.  */
+    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+      {
+	rtx reg = regno_reg_rtx[ix];
 
-      if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
-	{
-	  rtx bcast = fn (reg, rw, 0, data);
+	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+	  {
+	    rtx bcast = fn (reg, rw, 0, data);
 
-	  insn = emit_insn_after (bcast, insn);
-	}
-    }
+	    insn = emit_insn_after (bcast, insn);
+	    empty = false;
+	  }
+      }
+  return empty;
 }
 
 /* Worker for nvptx_vpropagate.  */
@@ -3889,12 +3894,13 @@ vprop_gen (rtx reg, propagate_mask pm,
 }
 
 /* Propagate state that is live at start of BLOCK across the vectors
-   of a single warp.  Propagation is inserted just after INSN.   */
+   of a single warp.  Propagation is inserted just after INSN.
+   IS_CALL and return as for nvptx_propagate.  */
 
-static void
-nvptx_vpropagate (basic_block block, rtx_insn *insn)
+static bool
+nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
 {
-  nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+  return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
 }
 
 /* Worker for nvptx_wpropagate.  */
@@ -3930,10 +3936,10 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
    indicates if this is just before partitioned mode (do spill), or
    just after it starts (do fill). Sequence is inserted just after
-   INSN.  */
+   INSN.  IS_CALL and return as for nvptx_propagate.  */
 
-static void
-nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+static bool
+nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
 {
   wcast_data_t data;
 
@@ -3941,7 +3947,9 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
   data.offset = 0;
   data.ptr = NULL_RTX;
 
-  nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+  bool empty = nvptx_propagate (is_call, block, insn,
+				pre_p ? PM_read : PM_write, wprop_gen, &data);
+  gcc_assert (empty == !data.offset);
   if (data.offset)
     {
       /* Stuff was emitted, initialize the base pointer now.  */
@@ -3951,6 +3959,7 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
       if (worker_bcast_size < data.offset)
 	worker_bcast_size = data.offset;
     }
+  return empty;
 }
 
 /* Emit a worker-level synchronization barrier.  We use different
@@ -4311,18 +4320,23 @@ nvptx_process_pars (parallel *par)
       inner_mask |= par->inner_mask;
     }
 
-  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-    /* No propagation needed for a call.  */;
-  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
+
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
     {
-      nvptx_wpropagate (false, par->forked_block, par->forked_insn);
-      nvptx_wpropagate (true, par->forked_block, par->fork_insn);
-      /* Insert begin and end synchronizations.  */
-      emit_insn_before (nvptx_wsync (false), par->forked_insn);
-      emit_insn_before (nvptx_wsync (true), par->join_insn);
+      nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
+      bool empty = nvptx_wpropagate (true, is_call,
+				     par->forked_block, par->fork_insn);
+
+      if (!empty || !is_call)
+	{
+	  /* Insert begin and end synchronizations.  */
+	  emit_insn_before (nvptx_wsync (false), par->forked_insn);
+	  emit_insn_before (nvptx_wsync (true), par->join_insn);
+	}
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
-    nvptx_vpropagate (par->forked_block, par->forked_insn);
+    nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
 
   /* Now do siblings.  */
   if (par->next)
diff --git a/libgomp/testsuite/libgomp.oacc-c++/ref-1.C b/libgomp/testsuite/libgomp.oacc-c++/ref-1.C
new file mode 100644
index 0000000..b3aaf0f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/ref-1.C
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+
+#include <stdio.h>
+
+#pragma acc routine vector
+void __attribute__((noinline, noclone))
+Vector (int *ptr, int n, const int &inc)
+{
+#pragma acc loop vector
+  for (unsigned ix = 0; ix < n; ix++)
+    ptr[ix] += inc;
+}
+
+#pragma acc routine worker
+void __attribute__((noinline, noclone))
+Worker (int *ptr, int m, int n, const int &inc)
+{
+#pragma acc loop worker
+  for (unsigned ix = 0; ix < m; ix++)
+    Vector(ptr + ix * n, n, inc);
+}
+
+int
+main (void)
+{
+  const int n = 32, m = 32;
+
+  int ary[m][n];
+  unsigned ix,  iy;
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      ary[ix][iy] = (ix << 8) + iy;
+
+#pragma acc parallel copy(ary)
+  {
+    Worker (&ary[0][0], m, n, 1 << 16);
+  }
+
+  int err = 0;
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 16) + (ix << 8) + iy))
+	{
+	  printf ("ary[%u][%u] = %x expected %x\n",
+		  ix, iy, ary[ix][iy], ((1 << 16) + (ix << 8) + iy));
+	  err++;
+	}
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+    }
+
+#pragma acc parallel copy(ary)
+  {
+    Vector (&ary[0][0], m * n, (1 << 24) - (1 << 16));
+  }
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
+	{
+	  printf ("ary[%u][%u] = %x expected %x\n",
+		  ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
+	  err++;
+	}
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+    }
+
+  return 0;
+}