From patchwork Tue Oct 20 17:15:15 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Nathan Sidwell <nathan@acm.org>
X-Patchwork-Id: 533084
Return-Path: 
 <gcc-patches-return-410707-incoming=patchwork.ozlabs.org@gcc.gnu.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Received: from sourceware.org (server1.sourceware.org [209.132.180.131])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256
	bits)) (No client certificate requested)
	by ozlabs.org (Postfix) with ESMTPS id 47320140281
	for <incoming@patchwork.ozlabs.org>;
	Wed, 21 Oct 2015 04:15:48 +1100 (AEDT)
Authentication-Results: ozlabs.org; dkim=pass (1024-bit key;
	unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org
	header.b=dnRwgTVu; dkim-atps=neutral
DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:to
	:from:subject:message-id:date:mime-version:content-type; q=dns;
	s=default; b=BYChCLJBc+4h7rS6zUP9WY9tMpF5ECGHS74SW/QOcSN/3kV7Qu
	kByti18T8SPGvii6txoVQBXxnhJDarZCm96i43oMiEsUcfey9K6xQNXy4d1M64ar
	EIGIn7mPApoLKFWQVU5io5P8r32ke0LotsQPFB54qxX5yhe6to150bHfk=
DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:to
	:from:subject:message-id:date:mime-version:content-type; s=
	default; bh=T4CnI8kAr4qEAgldMZmu2QaMgrk=; b=dnRwgTVuQy8ej7B/yUrI
	iwnzNE0UYO54lqW9vSXwLYYfVXXH5Hr/Fbj3UENrzvw9wV42VqrBmTXhIaKPwUJh
	0jQfknzZLPdpik9P/XyVe3ZdT5bqnmuSJ+1zw0mLLNoHJ4Zx8Su3BRtXJVb82zZS
	mWxWW+LIdWxtzzBPfc99o2s=
Received: (qmail 65402 invoked by alias); 20 Oct 2015 17:15:28 -0000
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-patches.gcc.gnu.org>
List-Unsubscribe: 
 <mailto:gcc-patches-unsubscribe-incoming=patchwork.ozlabs.org@gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-help@gcc.gnu.org>
Sender: gcc-patches-owner@gcc.gnu.org
Delivered-To: mailing list gcc-patches@gcc.gnu.org
Received: (qmail 65181 invoked by uid 89); 20 Oct 2015 17:15:24 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-1.8 required=5.0 tests=BAYES_00,
	FREEMAIL_FROM, KAM_ASCII_DIVIDERS, RCVD_IN_DNSWL_LOW,
	SPF_PASS autolearn=no version=3.3.2
X-HELO: mail-qg0-f46.google.com
Received: from mail-qg0-f46.google.com (HELO mail-qg0-f46.google.com)
	(209.85.192.46) by sourceware.org
	(qpsmtpd/0.93/v0.84-503-g423c35a) with (AES128-GCM-SHA256
	encrypted) ESMTPS; Tue, 20 Oct 2015 17:15:19 +0000
Received: by qgem9 with SMTP id m9so21186496qge.1 for
	<gcc-patches@gcc.gnu.org>; Tue, 20 Oct 2015 10:15:17 -0700 (PDT)
X-Received: by 10.140.151.140 with SMTP id 134mr3640036qhx.25.1445361316974;
	Tue, 20 Oct 2015 10:15:16 -0700 (PDT)
Received: from ?IPv6:2601:181:c000:c497:a2a8:cdff:fe3e:b48?
	([2601:181:c000:c497:a2a8:cdff:fe3e:b48]) by
	smtp.googlemail.com with ESMTPSA id
	i34sm1622477qgd.8.2015.10.20.10.15.16 (version=TLSv1.2
	cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128);
	Tue, 20 Oct 2015 10:15:16 -0700 (PDT)
To: GCC Patches <gcc-patches@gcc.gnu.org>
From: Nathan Sidwell <nathan@acm.org>
Subject: [gomp4] Update openacc loop iteration partitioning
Message-ID: <562676A3.5010804@acm.org>
Date: Tue, 20 Oct 2015 13:15:15 -0400
User-Agent: Mozilla/5.0 (X11; Linux x86_64;
	rv:38.0) Gecko/20100101 Thunderbird/38.3.0
MIME-Version: 1.0

In preparing this patch set for trunk, I discovered I'd flubbed the calculations 
for default contiguous looping.  This fixes the calculation in the target-side 
loop transformation code.  I also realized that the calculation appropriate for 
an accelerator is not the best for the host.  For the latter we want this to 
expand to a regular loop iterator.

Applied to gomp4 branch.

nathan

2015-10-20  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* omp-low.c (expand_oacc_for): Use -1 for unspecified static
	chunking.  Remove unnecessary gimple forcing.
	(oacc_xform_loop): Adjust chunk size calculation.  Don't chunk on
	host.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: New.
	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: New.

Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 229092)
+++ gcc/omp-low.c	(working copy)
@@ -9962,7 +9962,7 @@ expand_oacc_for (struct omp_region *regi
   enum tree_code cond_code = fd->loop.cond_code;
   enum tree_code plus_code = PLUS_EXPR;
 
-  tree chunk_size = integer_one_node;
+  tree chunk_size = integer_minus_one_node;
   tree gwv = integer_zero_node;
   tree iter_type = TREE_TYPE (v);
   tree diff_type = iter_type;
@@ -10110,10 +10110,6 @@ expand_oacc_for (struct omp_region *regi
       ass = gimple_build_assign (chunk_no, expr);
       gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
 
-      expr = fold_convert (diff_type, chunk_size);
-      chunk_size = force_gimple_operand_gsi (&gsi, expr, true,
-					     NULL_TREE, true, GSI_SAME_STMT);
-
       call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
 					 build_int_cst (integer_type_node,
 							IFN_GOACC_LOOP_CHUNKS),
@@ -16892,25 +16888,26 @@ oacc_xform_loop (gcall *call)
   tree dir = gimple_call_arg (call, 1);
   tree range = gimple_call_arg (call, 2);
   tree step = gimple_call_arg (call, 3);
-  tree chunk_size = gimple_call_arg (call, 4);
+  tree chunk_size = NULL_TREE;
   unsigned mask = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 5));
   tree lhs = gimple_call_lhs (call);
   tree type = TREE_TYPE (lhs);
   tree diff_type = TREE_TYPE (range);
   tree r = NULL_TREE;
   gimple_seq seq = NULL;
-  bool chunking, striding;
+  bool chunking = false, striding = true;
   unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
   unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
 
-  if (integer_zerop (chunk_size))
-    {
-      /* If we're at the gang or (worker with vector), we want each to
-	 execute a contiguous run of iterations.  Otherwise we want
-	 each element to stride.  */
-      striding = !((outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
-		   || ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
-		       && (outer_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))));
+#ifdef ACCEL_COMPILER
+  chunk_size = gimple_call_arg (call, 4);
+  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
+      || integer_zerop (chunk_size))   /* Default (also static).  */
+    {
+      /* If we're at the gang level, we want each to execute a
+	 contiguous run of iterations.  Otherwise we want each element
+	 to stride.  */
+      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
       chunking = false;
     }
   else
@@ -16919,7 +16916,16 @@ oacc_xform_loop (gcall *call)
       striding = integer_onep (chunk_size);
       chunking = !striding;
     }
+#endif
 
+  /* striding=true, chunking=true
+       -> invalid.
+     striding=true, chunking=false
+       -> chunks=1
+     striding=false,chunking=true
+       -> chunks=ceil (range/(chunksize*threads*step))
+     striding=false,chunking=false
+       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
   push_gimplify_context (true);
 
   switch (code)
@@ -16963,31 +16969,25 @@ oacc_xform_loop (gcall *call)
 	}
       else
 	{
-	  tree span;
 	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
 	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
 	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
 				     inner_size, outer_size);
 
+	  volume = fold_convert (diff_type, volume);
 	  if (chunking)
-	    {
-	      chunk_size = fold_convert (diff_type, chunk_size);
-
-	      span = inner_size;
-	      span = fold_convert (diff_type, span);
-	      span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size);
-	    }
+	    chunk_size = fold_convert (diff_type, chunk_size);
 	  else
 	    {
-	      tree per = fold_convert (diff_type, volume);
-	      per = fold_build2 (MULT_EXPR, diff_type, per, step);
+	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
 
-	      span = build2 (MINUS_EXPR, diff_type, range, dir);
-	      span = build2 (PLUS_EXPR, diff_type, span, per);
-	      span = build2 (TRUNC_DIV_EXPR, diff_type, span, per);
-	      span = build2 (MULT_EXPR, diff_type, span, inner_size);
+	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
 	    }
 
+	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+			      fold_convert (diff_type, inner_size));
 	  r = oacc_thread_numbers (true, outer_mask, &seq);
 	  r = fold_convert (diff_type, r);
 	  r = build2 (MULT_EXPR, diff_type, r, span);
@@ -16998,9 +16998,9 @@ oacc_xform_loop (gcall *call)
 
 	  if (chunking)
 	    {
-	      tree chunk = gimple_call_arg (call, 6);
-	      tree per = fold_convert (diff_type, volume);
-	      per = fold_build2 (MULT_EXPR, diff_type, per, chunk_size);
+	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
+	      tree per
+		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
 	      per = build2 (MULT_EXPR, diff_type, per, chunk);
 
 	      r = build2 (PLUS_EXPR, diff_type, r, per);
@@ -17016,29 +17016,29 @@ oacc_xform_loop (gcall *call)
 	r = range;
       else
 	{
-	  tree offset = gimple_call_arg (call, 6);
-	  tree span;
-	  
-	  if (chunking)
-	    {
-	      chunk_size = fold_convert (diff_type, chunk_size);
+	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+				     inner_size, outer_size);
 
-	      span = oacc_thread_numbers (false, inner_mask, &seq);
-	      span = fold_convert (diff_type, span);
-	      span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size);
-	    }
+	  volume = fold_convert (diff_type, volume);
+	  if (chunking)
+	    chunk_size = fold_convert (diff_type, chunk_size);
 	  else
 	    {
-	      tree per = oacc_thread_numbers (false, mask, &seq);
-	      per = fold_convert (diff_type, per);
-	      per = build2 (MULT_EXPR, diff_type, per, step);
-	      span = build2 (MINUS_EXPR, diff_type, range, dir);
-	      span = build2 (PLUS_EXPR, diff_type, span, per);
-	      span = build2 (TRUNC_DIV_EXPR, diff_type, span, per);
+	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+
+	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
 	    }
 
+	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+			      fold_convert (diff_type, inner_size));
+
 	  r = fold_build2 (MULT_EXPR, diff_type, span, step);
 
+	  tree offset = gimple_call_arg (call, 6);
 	  r = build2 (PLUS_EXPR, diff_type, r,
 		      fold_convert (diff_type, offset));
 	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c	(working copy)
@@ -0,0 +1,57 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_gangs(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop gang
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = ix / ((N + 31) / 32);
+	  int w = 0;
+	  int v = 0;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c	(working copy)
@@ -0,0 +1,57 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_gangs(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop gang (static:1)
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = ix % 32;
+	  int w = 0;
+	  int v = 0;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c	(working copy)
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop gang worker vector
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  
+	  int g = ix / (chunk_size * 32 * 32);
+	  int w = ix / 32 % 32;
+	  int v = ix % 32;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c	(working copy)
@@ -0,0 +1,57 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop vector
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = 0;
+	  int w = 0;
+	  int v = ix % 32;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c	(working copy)
@@ -0,0 +1,57 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop worker
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = 0;
+	  int w = ix % 32;
+	  int v = 0;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c	(working copy)
@@ -0,0 +1,57 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O2" */
+
+#include <stdio.h>
+
+#define N (32*32*32+17)
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int ondev = 0;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+  
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+  {
+#pragma acc loop worker vector
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	if (__builtin_acc_on_device (5))
+	  {
+	    int g = 0, w = 0, v = 0;
+
+	    __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+	    __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+	    __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+	    ary[ix] = (g << 16) | (w << 8) | v;
+	    ondev = 1;
+	  }
+	else
+	  ary[ix] = ix;
+      }
+  }
+
+  for (ix = 0; ix < N; ix++)
+    {
+      int expected = ix;
+      if(ondev)
+	{
+	  int g = 0;
+	  int w = (ix / 32) % 32;
+	  int v = ix % 32;
+
+	  expected = (g << 16) | (w << 8) | v;
+	}
+      
+      if (ary[ix] != expected)
+	{
+	  exit = 1;
+	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
+	}
+    }
+  
+  return exit;
+}