From patchwork Sun May 14 10:27:34 2017
X-Patchwork-Submitter: Thomas Schwinge
X-Patchwork-Id: 762112
From: Thomas Schwinge <Thomas_Schwinge@mentor.com>
To: <gcc-patches@gcc.gnu.org>
CC: Jakub Jelinek
Subject: Re: Runtime checking of OpenACC parallelism dimensions clauses
In-Reply-To: <87ziej37kq.fsf@hertz.schwinge.homeip.net>
References: <87ziej37kq.fsf@hertz.schwinge.homeip.net>
User-Agent: Notmuch/0.9-101-g81dad07 (http://notmuchmail.org) Emacs/24.5.1 (x86_64-pc-linux-gnu)
Date: Sun, 14 May 2017 12:27:34 +0200
Message-ID: <87d1bb3f8p.fsf@hertz.schwinge.homeip.net>
MIME-Version: 1.0

Hi!

On Thu, 11 May 2017 14:24:05 +0200, I wrote:
> OK for trunk?

> Runtime checking of OpenACC parallelism dimensions clauses

For now, committed to gomp-4_0-branch in r248030:

commit 59e5204e0ec16c0f14ec68148f856fd307ef8d51
Author: tschwinge
Date:   Sun May 14 10:25:46 2017 +0000

    Runtime checking of OpenACC parallelism dimensions clauses

    libgomp/
    * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
    * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
    (check_effective_target_c++): New procs.
    * testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
    (check_effective_target_c++): Likewise.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@248030 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog.gomp                             |   8 +
 libgomp/testsuite/libgomp.oacc-c++/c++.exp         |   7 +
 .../libgomp.oacc-c-c++-common/parallel-dims.c      | 526 ++++++++++++++++++++-
 libgomp/testsuite/libgomp.oacc-c/c.exp             |   7 +
 4 files changed, 536 insertions(+), 12 deletions(-)

Grüße
 Thomas


diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index def0feb..a1627a8 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,3 +1,11 @@
+2017-05-14  Thomas Schwinge
+
+	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
+	* testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
+	(check_effective_target_c++): New procs.
+	* testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
+	(check_effective_target_c++): Likewise.
+
 2017-05-12  Cesar Philippidis
 
 	* testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c: New test.
diff --git libgomp/testsuite/libgomp.oacc-c++/c++.exp libgomp/testsuite/libgomp.oacc-c++/c++.exp
index ba1a28e..695b96d 100644
--- libgomp/testsuite/libgomp.oacc-c++/c++.exp
+++ libgomp/testsuite/libgomp.oacc-c++/c++.exp
@@ -4,6 +4,13 @@ load_lib libgomp-dg.exp
 load_gcc_lib gcc-dg.exp
 load_gcc_lib torture-options.exp
 
+proc check_effective_target_c { } {
+    return 0
+}
+proc check_effective_target_c++ { } {
+    return 1
+}
+
 global shlib_ext
 
 set shlib_ext [get_shlib_extension]
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index f5766a4..3458757 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -1,25 +1,527 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
+   vector_length.
+   */
+
+/* { dg-additional-options "-foffload-force" } */
+
+#include <limits.h>
+#include <openacc.h>
+
+/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
+   not behaving as expected for -O0.  */
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
+{
+  if (acc_on_device ((int) acc_device_host))
+    return 0;
+  else if (acc_on_device ((int) acc_device_nvidia))
+    {
+      unsigned int r;
+      asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
+      return r;
+    }
+  else
+    __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
+{
+  if (acc_on_device ((int) acc_device_host))
+    return 0;
+  else if (acc_on_device ((int) acc_device_nvidia))
+    {
+      unsigned int r;
+      asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
+      return r;
+    }
+  else
+    __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
+{
+  if (acc_on_device ((int) acc_device_host))
+    return 0;
+  else if (acc_on_device ((int) acc_device_nvidia))
+    {
+      unsigned int r;
+      asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
+      return r;
+    }
+  else
+    __builtin_abort ();
+}
 
-/* Worker and vector size checks.  Picked an outrageously large
-   value.  */
 
 int main ()
 {
-  int dummy[10];
+  acc_init (acc_device_default);
 
-#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_workers" } */
+  /* Non-positive value.  */
+
+  /* GR, WS, VS.
+     */
+  {
+#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
+    int gangs_actual = GANGS;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
+  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
+    {
+      /* We're actually executing with num_gangs (1).  */
+      gangs_actual = 1;
+      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+	{
+	  /* .  */
+#if 0
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+#else
+	  int gangs = acc_gang ();
+	  gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
+	  gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
+	  int workers = acc_worker ();
+	  workers_min = (workers_min < workers) ? workers_min : workers;
+	  workers_max = (workers_max > workers) ? workers_max : workers;
+	  int vectors = acc_vector ();
+	  vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
+	  vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
+#endif
+	}
+    }
+    if (gangs_actual != 1)
+      __builtin_abort ();
+    if (gangs_min != 0 || gangs_max != gangs_actual - 1
+	|| workers_min != 0 || workers_max != 0
+	|| vectors_min != 0 || vectors_max != 0)
+      __builtin_abort ();
+#undef GANGS
+  }
+
+  /* GP, WS, VS.
+     */
+  {
+#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
+    int gangs_actual = GANGS;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) /* { dg-warning "region contains gang partitoned code but is not gang partitioned" } */ \
+  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
+    {
+      /* We're actually executing with num_gangs (1).  */
+      gangs_actual = 1;
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (gangs_actual != 1)
+      __builtin_abort ();
+    if (gangs_min != 0 || gangs_max != gangs_actual - 1
+	|| workers_min != 0 || workers_max != 0
+	|| vectors_min != 0 || vectors_max != 0)
+      __builtin_abort ();
+#undef GANGS
+  }
+
+  /* GR, WP, VS.  */
+  {
+#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
+    int workers_actual = WORKERS;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) /* { dg-warning "region contains worker partitoned code but is not worker partitioned" } */ \
+  num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
+    {
+      /* We're actually executing with num_workers (1).
+	 */
+      workers_actual = 1;
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (workers_actual != 1)
+      __builtin_abort ();
+    if (gangs_min != 0 || gangs_max != 0
+	|| workers_min != 0 || workers_max != workers_actual - 1
+	|| vectors_min != 0 || vectors_max != 0)
+      __builtin_abort ();
+#undef WORKERS
+  }
+
+  /* GR, WS, VP.  */
+  {
+#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
+    int vectors_actual = VECTORS;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "region contains vector partitoned code but is not vector partitioned" } */ \
+  /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } 170 } */ \
+  vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
+    {
+      /* We're actually executing with vector_length (1), just the GCC nvptx
+	 back end enforces vector_length (32).
+	 */
+      if (acc_on_device (acc_device_nvidia))
+	vectors_actual = 32;
+      else
+	vectors_actual = 1;
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (acc_get_device_type () == acc_device_nvidia)
+      {
+	if (vectors_actual != 32)
+	  __builtin_abort ();
+      }
+    else
+      if (vectors_actual != 1)
+	__builtin_abort ();
+    if (gangs_min != 0 || gangs_max != 0
+	|| workers_min != 0 || workers_max != 0
+	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
+      __builtin_abort ();
+#undef VECTORS
+  }
+
+
+  /* High value.  */
+
+  /* GR, WS, VS.  */
+  {
+    /* There is no actual limit for the number of gangs, so we try with a
+       rather high value.  */
+    int gangs = 12345;
+    int gangs_actual = gangs;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) /* { dg-warning "region is gang partitioned but does not contain gang partitioned code" } */ \
+  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
+  num_gangs (gangs)
+    {
+      if (acc_on_device (acc_device_host))
+	{
+	  /* We're actually executing with num_gangs (1).  */
+	  gangs_actual = 1;
+	}
+      /* As we're executing GR not GP, don't multiply with a "gangs_actual"
+	 factor.
+	 */
+      for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (gangs_actual < 1)
+      __builtin_abort ();
+    if (gangs_min != 0 || gangs_max != gangs_actual - 1
+	|| workers_min != 0 || workers_max != 0
+	|| vectors_min != 0 || vectors_max != 0)
+      __builtin_abort ();
+  }
+
+  /* GP, WS, VS.  */
+  {
+    /* There is no actual limit for the number of gangs, so we try with a
+       rather high value.  */
+    int gangs = 12345;
+    int gangs_actual = gangs;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+  num_gangs (gangs)
+    {
+      if (acc_on_device (acc_device_host))
+	{
+	  /* We're actually executing with num_gangs (1).  */
+	  gangs_actual = 1;
+	}
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (gangs_actual < 1)
+      __builtin_abort ();
+    if (gangs_min != 0 || gangs_max != gangs_actual - 1
+	|| workers_min != 0 || workers_max != 0
+	|| vectors_min != 0 || vectors_max != 0)
+      __builtin_abort ();
+  }
+
+  /* GR, WP, VS.  */
+  {
+    /* We try with an outrageously large value.
+     */
+#define WORKERS 2 << 20
+    int workers_actual = WORKERS;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \
+  num_workers (WORKERS)
+    {
+      if (acc_on_device (acc_device_host))
+	{
+	  /* We're actually executing with num_workers (1).  */
+	  workers_actual = 1;
+	}
+      else if (acc_on_device (acc_device_nvidia))
+	{
+	  /* The GCC nvptx back end enforces num_workers (32).  */
+	  workers_actual = 32;
+	}
+      else
+	__builtin_abort ();
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (workers_actual < 1)
+      __builtin_abort ();
+    if (gangs_min != 0 || gangs_max != 0
+	|| workers_min != 0 || workers_max != workers_actual - 1
+	|| vectors_min != 0 || vectors_max != 0)
+      __builtin_abort ();
+#undef WORKERS
+  }
+
+  /* GR, WP, VS.  */
+  {
+    /* We try with an outrageously large value.  */
+    int workers = 2 << 20;
+    /* For nvptx offloading, this one will not result in "using num_workers
+       (32), ignoring runtime setting", and will in fact try to launch with
+       "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
+       error: invalid argument".  So, limit ourselves here.
+     */
+    if (acc_get_device_type () == acc_device_nvidia)
+      workers = 32;
+    int workers_actual = workers;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) \
+  num_workers (workers)
+    {
+      if (acc_on_device (acc_device_host))
+	{
+	  /* We're actually executing with num_workers (1).  */
+	  workers_actual = 1;
+	}
+      else if (acc_on_device (acc_device_nvidia))
+	{
+	  /* We're actually executing with num_workers (32).  */
+	  /* workers_actual = 32; */
+	}
+      else
+	__builtin_abort ();
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (workers_actual < 1)
+      __builtin_abort ();
+    if (gangs_min != 0 || gangs_max != 0
+	|| workers_min != 0 || workers_max != workers_actual - 1
+	|| vectors_min != 0 || vectors_max != 0)
+      __builtin_abort ();
+  }
+
+  /* GR, WS, VP.  */
   {
-#pragma acc loop worker
-    for (int i = 0; i < 10; i++)
-      dummy[i] = i;
+    /* We try with an outrageously large value.  */
+#define VECTORS 2 << 20
+    int vectors_actual = VECTORS;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \
+  vector_length (VECTORS)
+    {
+      if (acc_on_device (acc_device_host))
+	{
+	  /* We're actually executing with vector_length (1).
+	     */
+	  vectors_actual = 1;
+	}
+      else if (acc_on_device (acc_device_nvidia))
+	{
+	  /* The GCC nvptx back end enforces vector_length (32).  */
+	  vectors_actual = 32;
+	}
+      else
+	__builtin_abort ();
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (vectors_actual < 1)
+      __builtin_abort ();
+    if (gangs_min != 0 || gangs_max != 0
+	|| workers_min != 0 || workers_max != 0
+	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
+      __builtin_abort ();
+#undef VECTORS
   }
 
-#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_length" } */
+  /* GR, WS, VP.  */
   {
-#pragma acc loop vector
-    for (int i = 0; i < 10; i++)
-      dummy[i] = i;
+    /* We try with an outrageously large value.  */
+    int vectors = 2 << 20;
+    int vectors_actual = vectors;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_selected } } */ \
+  vector_length (vectors)
+    {
+      if (acc_on_device (acc_device_host))
+	{
+	  /* We're actually executing with vector_length (1).  */
+	  vectors_actual = 1;
+	}
+      else if (acc_on_device (acc_device_nvidia))
+	{
+	  /* The GCC nvptx back end enforces vector_length (32).
+	     */
+	  vectors_actual = 32;
+	}
+      else
+	__builtin_abort ();
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (vectors_actual < 1)
+      __builtin_abort ();
+    if (gangs_min != 0 || gangs_max != 0
+	|| workers_min != 0 || workers_max != 0
+	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
+      __builtin_abort ();
+  }
+
+  /* Composition of GP, WP, VP.  */
+  {
+    int gangs = 12345;
+    /* With nvptx offloading, multi-level reductions apparently are very slow
+       in the following case.  So, limit ourselves here.  */
+    if (acc_get_device_type () == acc_device_nvidia)
+      gangs = 3;
+    int gangs_actual = gangs;
+#define WORKERS 3
+    int workers_actual = WORKERS;
+#define VECTORS 11
+    int vectors_actual = VECTORS;
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_selected } } */ \
+  num_gangs (gangs) \
+  num_workers (WORKERS) \
+  vector_length (VECTORS)
+    {
+      if (acc_on_device (acc_device_host))
+	{
+	  /* We're actually executing with num_gangs (1), num_workers (1),
+	     vector_length (1).  */
+	  gangs_actual = 1;
+	  workers_actual = 1;
+	  vectors_actual = 1;
+	}
+      else if (acc_on_device (acc_device_nvidia))
+	{
+	  /* The GCC nvptx back end enforces vector_length (32).
+	     */
+	  vectors_actual = 32;
+	}
+      else
+	__builtin_abort ();
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+	for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+	  for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
+	    {
+	      gangs_min = gangs_max = acc_gang ();
+	      workers_min = workers_max = acc_worker ();
+	      vectors_min = vectors_max = acc_vector ();
+	    }
+    }
+    if (gangs_min != 0 || gangs_max != gangs_actual - 1
+	|| workers_min != 0 || workers_max != workers_actual - 1
+	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
+      __builtin_abort ();
+#undef VECTORS
+#undef WORKERS
+  }
+
+
+  /* We can't test parallelized OpenACC kernels constructs in this way: use of
+     the acc_gang, acc_worker, acc_vector functions will make the construct
+     unparallelizable.  */
+
+
+  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
+     kernels.  */
+  {
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc kernels
+    {
+      /* This is to make the OpenACC kernels construct unparallelizable.
+	 */
+      asm volatile ("" : : : "memory");
+
+#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100; i > -100; --i)
+	{
+	  gangs_min = gangs_max = acc_gang ();
+	  workers_min = workers_max = acc_worker ();
+	  vectors_min = vectors_max = acc_vector ();
+	}
+    }
+    if (gangs_min != 0 || gangs_max != 1 - 1
+	|| workers_min != 0 || workers_max != 1 - 1
+	|| vectors_min != 0 || vectors_max != 1 - 1)
+      __builtin_abort ();
+  }
+
+  return 0;
 }
diff --git libgomp/testsuite/libgomp.oacc-c/c.exp libgomp/testsuite/libgomp.oacc-c/c.exp
index e4be086..16f8295 100644
--- libgomp/testsuite/libgomp.oacc-c/c.exp
+++ libgomp/testsuite/libgomp.oacc-c/c.exp
@@ -15,6 +15,13 @@ load_lib libgomp-dg.exp
 load_gcc_lib gcc-dg.exp
 load_gcc_lib torture-options.exp
 
+proc check_effective_target_c { } {
+    return 1
+}
+proc check_effective_target_c++ { } {
+    return 0
+}
+
 # Initialize dg.
 dg-init
 torture-init