From 3861d362ec7e3c50742fc43833fe9d8674f4070e Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <tschwinge@baylibre.com>
Date: Sat, 7 Dec 2024 00:17:49 +0100
Subject: [PATCH] nvptx: PTX 'alloca' for '-mptx=7.3'+, '-march=sm_52'+
[PR65181]
..., and use it for '-mno-soft-stack': PTX "native" stacks.
PR target/65181
gcc/
* config/nvptx/nvptx.cc (nvptx_get_drap_rtx): Handle
'!TARGET_SOFT_STACK'.
* config/nvptx/nvptx.md (define_c_enum "unspec"): Add
'UNSPEC_STACKSAVE', 'UNSPEC_STACKRESTORE'.
(define_expand "allocate_stack", define_expand "save_stack_block")
(define_expand "restore_stack_block"): Handle '!TARGET_SOFT_STACK',
PTX 'alloca'.
(define_insn "@nvptx_alloca_<mode>")
(define_insn "@nvptx_stacksave_<mode>")
(define_insn "@nvptx_stackrestore_<mode>"): New.
* doc/invoke.texi (Nvidia PTX Options): Update '-msoft-stack',
'-mno-soft-stack'.
* doc/sourcebuild.texi (nvptx-specific attributes): Document
'nvptx_runtime_alloca_ptx'.
(Add Options): Document 'nvptx_alloca_ptx'.
gcc/testsuite/
* gcc.target/nvptx/alloca-1.c: Evolve into...
* gcc.target/nvptx/alloca-1-O0.c: ... this, ...
* gcc.target/nvptx/alloca-1-O1.c: ... this, and...
* gcc.target/nvptx/alloca-1-sm_30.c: ... this.
* gcc.target/nvptx/vla-1.c: Evolve into...
* gcc.target/nvptx/vla-1-O0.c: ... this, ...
* gcc.target/nvptx/vla-1-O1.c: ... this, and...
* gcc.target/nvptx/vla-1-sm_30.c: ... this.
* gcc.c-torture/execute/pr36321.c: Adjust.
* gcc.target/nvptx/__builtin_alloca_0-1-O0.c: Likewise.
* gcc.target/nvptx/__builtin_alloca_0-1-O1.c: Likewise.
* gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1.c:
Likewise.
* gcc.target/nvptx/softstack.c: Likewise.
* gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1-sm_30.c:
New.
* gcc.target/nvptx/alloca-2-O0.c: Likewise.
* gcc.target/nvptx/alloca-3-O1.c: Likewise.
* gcc.target/nvptx/alloca-4-O3.c: Likewise.
* gcc.target/nvptx/alloca-5.c: Likewise.
* lib/target-supports.exp (check_effective_target_alloca): Adjust.
(check_nvptx_default_ptx_isa_target_architecture_at_least)
(check_nvptx_runtime_ptx_isa_target_architecture_at_least)
(check_effective_target_nvptx_runtime_alloca_ptx)
(add_options_for_nvptx_alloca_ptx): New.
libgomp/
* fortran.c (omp_get_device_from_uid_): Adjust.
* testsuite/libgomp.oacc-fortran/privatized-ref-2.f90: Likewise.
---
gcc/config/nvptx/nvptx.cc | 4 +-
gcc/config/nvptx/nvptx.md | 92 ++++++++++++---
gcc/doc/invoke.texi | 13 ++-
gcc/doc/sourcebuild.texi | 6 +
gcc/testsuite/gcc.c-torture/execute/pr36321.c | 3 +
.../nvptx/__builtin_alloca_0-1-O0.c | 2 +
.../nvptx/__builtin_alloca_0-1-O1.c | 2 +
...ack_save___builtin_stack_restore-1-sm_30.c | 28 +++++
...tin_stack_save___builtin_stack_restore-1.c | 8 +-
gcc/testsuite/gcc.target/nvptx/alloca-1-O0.c | 49 ++++++++
gcc/testsuite/gcc.target/nvptx/alloca-1-O1.c | 33 ++++++
.../nvptx/{alloca-1.c => alloca-1-sm_30.c} | 1 +
gcc/testsuite/gcc.target/nvptx/alloca-2-O0.c | 12 ++
gcc/testsuite/gcc.target/nvptx/alloca-3-O1.c | 40 +++++++
gcc/testsuite/gcc.target/nvptx/alloca-4-O3.c | 55 +++++++++
gcc/testsuite/gcc.target/nvptx/alloca-5.c | 107 ++++++++++++++++++
gcc/testsuite/gcc.target/nvptx/softstack.c | 2 +
gcc/testsuite/gcc.target/nvptx/vla-1-O0.c | 29 +++++
gcc/testsuite/gcc.target/nvptx/vla-1-O1.c | 40 +++++++
.../nvptx/{vla-1.c => vla-1-sm_30.c} | 1 +
gcc/testsuite/lib/target-supports.exp | 105 ++++++++++++++++-
libgomp/fortran.c | 4 +-
.../libgomp.oacc-fortran/privatized-ref-2.f90 | 10 --
23 files changed, 611 insertions(+), 35 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/nvptx/__builtin_stack_save___builtin_stack_restore-1-sm_30.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-1-O0.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-1-O1.c
rename gcc/testsuite/gcc.target/nvptx/{alloca-1.c => alloca-1-sm_30.c} (83%)
create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-2-O0.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-3-O1.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-4-O3.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/alloca-5.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/vla-1-O0.c
create mode 100644 gcc/testsuite/gcc.target/nvptx/vla-1-O1.c
rename gcc/testsuite/gcc.target/nvptx/{vla-1.c => vla-1-sm_30.c} (83%)
@@ -245,7 +245,7 @@ default_ptx_version_option (void)
warp convergence. */
res = MAX (res, PTX_VERSION_6_0);
- /* For sm_52+, pick at least 7.3. */
+ /* For sm_52+, pick at least 7.3, to enable PTX 'alloca'. */
if (ptx_isa_option >= PTX_ISA_SM52)
res = MAX (res, PTX_VERSION_7_3);
@@ -1797,7 +1797,7 @@ nvptx_function_ok_for_sibcall (tree, tree)
static rtx
nvptx_get_drap_rtx (void)
{
- if (TARGET_SOFT_STACK && stack_realign_drap)
+ if (stack_realign_drap)
return arg_pointer_rtx;
return NULL_RTX;
}
@@ -35,8 +35,9 @@
UNSPEC_FPINT_NEARBYINT
UNSPEC_ALLOCA
-
UNSPEC_SET_SOFTSTACK
+ UNSPEC_STACKSAVE
+ UNSPEC_STACKRESTORE
UNSPEC_DIM_SIZE
@@ -1663,22 +1664,47 @@
(match_operand 1 "nvptx_register_operand")]
""
{
- if (TARGET_SOFT_STACK)
+ if (!TARGET_SOFT_STACK
+ && TARGET_PTX_7_3
+ && TARGET_SM52)
+ emit_insn (gen_nvptx_alloca (Pmode, operands[0], operands[1]));
+ else if (!TARGET_SOFT_STACK)
+ {
+ sorry ("target cannot support alloca");
+ emit_insn (gen_nop ());
+ }
+ else if (TARGET_SOFT_STACK)
{
emit_move_insn (stack_pointer_rtx,
gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1]));
emit_insn (gen_set_softstack (Pmode, stack_pointer_rtx));
emit_move_insn (operands[0], virtual_stack_dynamic_rtx);
- DONE;
}
- /* The ptx documentation specifies an alloca intrinsic (for 32 bit
- only) but notes it is not implemented. The assembler emits a
- confused error message. Issue a blunt one now instead. */
- sorry ("target cannot support alloca");
- emit_insn (gen_nop ());
+ else
+ gcc_unreachable ();
DONE;
})
+(define_insn "@nvptx_alloca_<mode>"
+ [(set (match_operand:P 0 "nvptx_register_operand" "=R")
+ (unspec:P [(match_operand:P 1 "nvptx_nonmemory_operand" "Ri")]
+ UNSPEC_ALLOCA))]
+ "TARGET_PTX_7_3
+ && TARGET_SM52"
+ {
+ /* Convert the address from '.local' state space to generic. That way,
+ we don't have to use 'st.local', 'ld.local', and can easily pass the
+ address to other "generic functions".
+ TODO 'gcc.target/nvptx/alloca-5.c' */
+ output_asm_insn ("{", NULL);
+ output_asm_insn ("\\t.reg%t0\\t%0_local;", operands);
+ output_asm_insn ("\\talloca%u0\\t%0_local, %1;", operands);
+ output_asm_insn ("\\tcvta.local%u0\\t%0, %0_local;", operands);
+ output_asm_insn ("}", NULL);
+ return "";
+ }
+ [(set_attr "predicable" "no")])
+
(define_insn "@set_softstack_<mode>"
[(unspec [(match_operand:P 0 "nvptx_register_operand" "R")]
UNSPEC_SET_SOFTSTACK)]
@@ -1692,30 +1718,64 @@
(match_operand 1 "register_operand" "")]
"!TARGET_SOFT_STACK"
{
- /* The concept of a '%stack' pointer doesn't apply like this for
- PTX "native" stacks. GCC however occasionally synthesizes
- '__builtin_stack_save ()', '__builtin_stack_restore ()', and isn't able to
- optimize them all away. Just submit a dummy -- user code shouldn't be
- able to observe this. */
- emit_move_insn (operands[0], GEN_INT (0xdeadbeef));
+ if (TARGET_PTX_7_3
+ && TARGET_SM52)
+ {
+ gcc_checking_assert (REG_P (operands[0]));
+ emit_insn (gen_nvptx_stacksave (Pmode, operands[0], operands[1]));
+ }
+ else
+ {
+ /* The concept of a '%stack' pointer doesn't apply like this.
+ GCC however occasionally synthesizes '__builtin_stack_save ()',
+ '__builtin_stack_restore ()', and isn't able to optimize them all
+ away. Just submit a dummy -- user code shouldn't be able to observe
+ this. */
+ emit_move_insn (operands[0], GEN_INT (0xdeadbeef));
+ }
DONE;
})
+(define_insn "@nvptx_stacksave_<mode>"
+ [(set (match_operand:P 0 "nvptx_register_operand" "=R")
+ (unspec:P [(match_operand:P 1 "register_operand" "R")]
+ UNSPEC_STACKSAVE))]
+ "TARGET_PTX_7_3
+ && TARGET_SM52"
+ "%.\\tstacksave%u0\\t%0;")
+
(define_expand "restore_stack_block"
[(match_operand 0 "register_operand" "")
(match_operand 1 "register_operand" "")]
""
{
- if (!TARGET_SOFT_STACK)
+ if (!TARGET_SOFT_STACK
+ && TARGET_PTX_7_3
+ && TARGET_SM52)
+ {
+ operands[1] = force_reg (Pmode, operands[1]);
+ emit_insn (gen_nvptx_stackrestore (Pmode, operands[0], operands[1]));
+ }
+ else if (!TARGET_SOFT_STACK)
; /* See 'save_stack_block'. */
- else
+ else if (TARGET_SOFT_STACK)
{
emit_move_insn (operands[0], operands[1]);
emit_insn (gen_set_softstack (Pmode, operands[0]));
}
+ else
+ gcc_unreachable ();
DONE;
})
+(define_insn "@nvptx_stackrestore_<mode>"
+ [(set (match_operand:P 0 "nvptx_register_operand" "=R")
+ (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")]
+ UNSPEC_STACKRESTORE))]
+ "TARGET_PTX_7_3
+ && TARGET_SM52"
+ "%.\\tstackrestore%u1\\t%1;")
+
(define_expand "save_stack_function"
[(match_operand 0 "register_operand" "")
(match_operand 1 "register_operand" "")]
@@ -30232,8 +30232,19 @@ Apply partitioned execution optimizations. This is the default when any
level of optimization is selected.
@opindex msoft-stack
+@opindex mno-soft-stack
@item -msoft-stack
-Generate code that does not use @code{.local} memory
+@itemx -mno-soft-stack
+For @option{-mno-soft-stack} (the default, unless @option{-mgomp} has
+been specified), use PTX ``native'' stacks, that is,
+generate code that uses @code{.local} memory or PTX @code{alloca}
+directly for stack storage.
+Unless @option{-mptx=7.3} or higher and @option{-march=sm_52} or
+higher are active, variable-length arrays and dynamically allocating
+memory on the stack with @code{alloca} are not supported.
+
+For @option{-msoft-stack} (implied by @option{-mgomp}),
+generate code that does not use @code{.local} memory or PTX @code{alloca}
directly for stack storage. Instead, a per-warp stack pointer is
maintained explicitly. This enables variable-length stack allocation (with
variable-length arrays or @code{alloca}), and when global memory is used for
@@ -2434,6 +2434,9 @@ nvptx code by default compiles for at least PTX ISA version 6.0.
The nvptx runtime environment supports the PTX ISA directive
@code{.alias}.
+@item nvptx_runtime_alloca_ptx
+The nvptx runtime environment supports PTX @code{alloca}.
+
@item nvptx_softstack
nvptx @option{-msoft-stack} is enabled.
@end table
@@ -3359,6 +3362,9 @@ Only MIPS targets support this feature, and only then in certain modes.
@item nvptx_alias_ptx
Enable using the PTX ISA directive @code{.alias} on nvptx targets.
+@item nvptx_alloca_ptx
+Enable PTX @code{alloca} on nvptx targets.
+
@item riscv_a
Add the 'A' extension to the -march string on RISC-V targets.
@@ -1,4 +1,7 @@
/* { dg-skip-if "requires alloca" { ! alloca } { "-O0" } { "" } } */
+/* See 'gcc.target/nvptx/__builtin_alloca_0-1-O0.c'.
+ { dg-xfail-if TODO { nvptx-*-* && { ! nvptx_softstack } } { "-O0" } { "" } } */
+
extern void abort (void);
extern __SIZE_TYPE__ strlen (const char *);
@@ -6,6 +6,8 @@
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
+/* See 'gcc.c-torture/execute/pr36321.c', '-O0'. */
+
void sink(void *);
void f(void)
@@ -6,6 +6,8 @@
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
+/* See 'gcc.c-torture/execute/pr36321.c', '-O0'. */
+
void sink(void *);
void f(void)
new file mode 100644
@@ -0,0 +1,28 @@
+/* Document what we do for '__builtin_stack_save()', '__builtin_stack_restore()'. */
+
+/* { dg-do assemble } */
+/* { dg-options {-O3 -mno-soft-stack} } */
+/* { dg-additional-options -march=sm_30 } */
+/* { dg-additional-options -save-temps } */
+/* { dg-final { check-function-bodies {** } {} } } */
+
+void *p;
+
+void f(void)
+{
+ // 0xdeadbeef
+ p = __builtin_stack_save();
+ asm volatile ("" : : : "memory");
+ // no-op
+ __builtin_stack_restore(p);
+ asm volatile ("" : : : "memory");
+}
+/*
+** f:
+** \.visible \.func f
+** {
+** \.reg\.u64 (%r[0-9]+);
+** mov\.u64 \1, 3735928559;
+** st\.global\.u64 \[p\], \1;
+** ret;
+*/
@@ -2,6 +2,7 @@
/* { dg-do assemble } */
/* { dg-options {-O3 -mno-soft-stack} } */
+/* { dg-add-options nvptx_alloca_ptx } */
/* { dg-additional-options -save-temps } */
/* { dg-final { check-function-bodies {** } {} } } */
@@ -9,10 +10,8 @@ void *p;
void f(void)
{
- // 0xdeadbeef
p = __builtin_stack_save();
asm volatile ("" : : : "memory");
- // no-op
__builtin_stack_restore(p);
asm volatile ("" : : : "memory");
}
@@ -21,7 +20,10 @@ void f(void)
** \.visible \.func f
** {
** \.reg\.u64 (%r[0-9]+);
-** mov\.u64 \1, 3735928559;
+** \.reg\.u64 (%r[0-9]+);
+** stacksave\.u64 \1;
** st\.global\.u64 \[p\], \1;
+** ld\.global\.u64 \2, \[p\];
+** stackrestore\.u64 \2;
** ret;
*/
new file mode 100644
@@ -0,0 +1,49 @@
+/* { dg-do assemble } */
+/* { dg-options {-O0 -mno-soft-stack} } */
+/* { dg-add-options nvptx_alloca_ptx } */
+/* { dg-additional-options -save-temps } */
+/* { dg-final { check-function-bodies {** } {} } } */
+
+void sink(void *);
+
+void f(void)
+{
+ sink(__builtin_alloca(123));
+}
+/*
+** f:
+** \.visible \.func f
+** {
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** mov\.u64 \11, 16;
+** add\.u64 \2, \11, -1;
+** add\.u64 \3, \2, 123;
+** div\.u64 \4, \3, 16;
+** mul\.lo\.u64 \5, \4, 16;
+** {
+** \.reg\.u64 \6_local;
+** alloca\.u64 \6_local, \5;
+** cvta\.local\.u64 \6, \6_local;
+** }
+** add\.u64 \7, \6, 15;
+** shr\.u64 \8, \7, 4;
+** shl\.b64 \9, \8, 4;
+** mov\.u64 \1, \9;
+** mov\.u64 \10, \1;
+** {
+** \.param\.u64 %out_arg1;
+** st\.param\.u64 \[%out_arg1\], \10;
+** call sink, \(%out_arg1\);
+** }
+** ret;
+*/
new file mode 100644
@@ -0,0 +1,33 @@
+/* { dg-do assemble } */
+/* { dg-options {-O1 -mno-soft-stack} } */
+/* { dg-add-options nvptx_alloca_ptx } */
+/* { dg-additional-options -save-temps } */
+/* { dg-final { check-function-bodies {** } {} } } */
+
+void sink(void *);
+
+void f(void)
+{
+ sink(__builtin_alloca(123));
+}
+/*
+** f:
+** \.visible \.func f
+** {
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** {
+** \.reg\.u64 \1_local;
+** alloca\.u64 \1_local, 128;
+** cvta\.local\.u64 \1, \1_local;
+** }
+** add\.u64 \2, \1, 15;
+** and\.b64 \3, \2, -16;
+** {
+** \.param\.u64 %out_arg1;
+** st\.param\.u64 \[%out_arg1\], \3;
+** call sink, \(%out_arg1\);
+** }
+** ret;
+*/
similarity index 83%
rename from gcc/testsuite/gcc.target/nvptx/alloca-1.c
rename to gcc/testsuite/gcc.target/nvptx/alloca-1-sm_30.c
@@ -1,5 +1,6 @@
/* { dg-do compile } */
/* { dg-options -mno-soft-stack } */
+/* { dg-additional-options -march=sm_30 } */
void sink(void *);
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do link } */
+/* { dg-do run { target nvptx_runtime_alloca_ptx } } */
+/* { dg-options {-O0 -mno-soft-stack} } */
+/* { dg-add-options nvptx_alloca_ptx } */
+/* { dg-additional-options -save-temps } */
+
+int
+main(void)
+{
+ return !(__builtin_alloca(100) != __builtin_alloca(10));
+}
+/* { dg-final { scan-assembler-times {(?n)\talloca\.u64\t%r[0-9]+_local, %r[0-9]+;$} 2 } } */
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do assemble } */
+/* { dg-options {-O1 -mno-soft-stack} } */
+/* { dg-add-options nvptx_alloca_ptx } */
+/* { dg-additional-options -save-temps } */
+/* { dg-final { check-function-bodies {** } {} } } */
+
+void sink(void *);
+
+void *p;
+
+void f(void)
+{
+ p = __builtin_stack_save();
+ sink(__builtin_alloca(25));
+ __builtin_stack_restore(p);
+}
+/*
+** f:
+** \.visible \.func f
+** {
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** stacksave\.u64 \1;
+** st\.global\.u64 \[p\], \1;
+** {
+** \.reg\.u64 \2_local;
+** alloca\.u64 \2_local, 32;
+** cvta\.local\.u64 \2, \2_local;
+** }
+** add\.u64 \3, \2, 15;
+** and\.b64 \4, \3, -16;
+** {
+** \.param\.u64 %out_arg1;
+** st\.param\.u64 \[%out_arg1\], \4;
+** call sink, \(%out_arg1\);
+** }
+** ret;
+*/
new file mode 100644
@@ -0,0 +1,55 @@
+/* { dg-do assemble } */
+/* { dg-options {-O3 -mno-soft-stack} } */
+/* { dg-add-options nvptx_alloca_ptx } */
+/* { dg-additional-options -save-temps } */
+/* { dg-final { check-function-bodies {** } {} } } */
+
+void sink(void *);
+
+void f(void)
+{
+ void *p;
+ p = __builtin_stack_save();
+ sink(__builtin_alloca(25));
+ __builtin_stack_restore(p);
+ sink(__builtin_alloca(13));
+}
+/*
+** f:
+** \.visible \.func f
+** {
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** stacksave\.u64 \1;
+** {
+** \.reg\.u64 \2_local;
+** alloca\.u64 \2_local, 32;
+** cvta\.local\.u64 \2, \2_local;
+** }
+** add\.u64 \3, \2, 15;
+** and\.b64 \4, \3, -16;
+** {
+** \.param\.u64 %out_arg1;
+** st\.param\.u64 \[%out_arg1\], \4;
+** call sink, \(%out_arg1\);
+** }
+** stackrestore\.u64 \1;
+** {
+** \.reg\.u64 \5_local;
+** alloca\.u64 \5_local, 16;
+** cvta\.local\.u64 \5, \5_local;
+** }
+** add\.u64 \6, \5, 15;
+** and\.b64 \7, \6, -16;
+** {
+** \.param\.u64 %out_arg1;
+** st\.param\.u64 \[%out_arg1\], \7;
+** call sink, \(%out_arg1\);
+** }
+** ret;
+*/
new file mode 100644
@@ -0,0 +1,107 @@
+/* { dg-do link } */
+/* { dg-do run { target nvptx_runtime_alloca_ptx } } */
+/* { dg-options {-O2 -mno-soft-stack} } */
+/* { dg-add-options nvptx_alloca_ptx } */
+/* { dg-additional-options -save-temps } */
+/* { dg-final { check-function-bodies {** } {} } } */
+
+/* See also 'gcc.target/nvptx/softstack.c'. */
+
+static __attribute__((noipa)) int f(int *p)
+{
+ return __sync_lock_test_and_set(p, 1);
+}
+/*
+** f:
+** \.func \(\.param\.u32 %value_out\) f \(\.param\.u64 %in_ar0\)
+** {
+** \.reg\.u32 %value;
+** \.reg\.u64 %ar0;
+** ld\.param\.u64 %ar0, \[%in_ar0\];
+** \.reg\.u32 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** mov\.u64 \2, %ar0;
+** atom\.exch\.b32 \1, \[\2\], 1;
+** membar\.sys;
+** mov\.u32 %value, \1;
+** st\.param\.u32 \[%value_out\], %value;
+** ret;
+*/
+
+static __attribute__((noipa)) int g(int n)
+{
+ /* Check that variable-length stack allocation works. */
+ int v[n];
+ v[0] = 0;
+ /* Check that atomic operations can be applied to auto data. */
+ return f(v) == 0 && v[0] == 1;
+}
+/*
+** g:
+** \.func \(\.param\.u32 %value_out\) g \(\.param\.u32 %in_ar0\)
+** {
+** \.reg\.u32 %value;
+** \.reg\.u32 %ar0;
+** ld\.param\.u32 %ar0, \[%in_ar0\];
+** \.reg\.u32 (%r[0-9]+);
+** \.reg\.u32 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u32 (%r[0-9]+);
+** \.reg\.u32 (%r[0-9]+);
+** \.reg\.pred (%r[0-9]+);
+** \.reg\.u32 (%r[0-9]+);
+** \.reg\.pred (%r[0-9]+);
+** mov\.u32 \2, %ar0;
+** cvt\.s64\.s32 \3, \2;
+** shl\.b64 \4, \3, 2;
+** add\.u64 \5, \4, 15;
+** and\.b64 \6, \5, -16;
+** {
+** \.reg\.u64 \7_local;
+** alloca\.u64 \7_local, \6;
+** cvta\.local\.u64 \7, \7_local;
+** }
+** add\.u64 \8, \7, 3;
+** and\.b64 \9, \8, -4;
+** mov\.u32 \10, 0;
+** st\.u32 \[\9\], \10;
+** {
+** \.param\.u32 %value_in;
+** \.param\.u64 %out_arg1;
+** st\.param\.u64 \[%out_arg1\], \9;
+** call \(%value_in\), f, \(%out_arg1\);
+** ld\.param\.u32 \11, \[%value_in\];
+** }
+** setp\.ne\.u32 \12, \11, 0;
+** @\12 bra (\$L[0-9]+);
+** ld\.u32 \13, \[\9\];
+** setp\.eq\.u32 \14, \13, 1;
+** selp\.u32 \1, 1, 0, \14;
+** bra (\$L[0-9]+);
+** \15:
+** mov\.u32 \1, \10;
+** \16:
+** mov\.u32 %value, \1;
+** st\.param\.u32 \[%value_out\], %value;
+** ret;
+*/
+
+int main()
+{
+ if (!g(1))
+ __builtin_abort();
+ return 0;
+}
+
+/* PTX 'atom' isn't acceptable for '.local' memory:
+ 'operation not supported on global/shared address space' [sic]
+ ('CUDA_ERROR_INVALID_ADDRESS_SPACE'), thus FAILs for 'alloca'ed memory.
+ We'd have to use the 'nvptx_mem_local_p' replacements, but currently lack a
+ mechanism for doing so (TODO).
+ { dg-xfail-run-if TODO { *-*-* } } */
@@ -1,6 +1,8 @@
/* { dg-options "-O2 -msoft-stack" } */
/* { dg-do run } */
+/* See also 'gcc.target/nvptx/alloca-5.c'. */
+
static __attribute__((noinline,noclone)) int f(int *p)
{
return __sync_lock_test_and_set(p, 1);
new file mode 100644
@@ -0,0 +1,29 @@
+/* { dg-do assemble } */
+/* { dg-options {-O0 -mno-soft-stack} } */
+/* { dg-add-options nvptx_alloca_ptx } */
+/* { dg-additional-options -save-temps } */
+/* { dg-final { check-function-bodies {**} {} } } */
+
+void sink(void *);
+
+void f(int s)
+{
+ char a[s];
+ sink(a);
+}
+/*
+** f:
+** ...
+** cvt\.s64\.s32 (%r[0-9]+), (%r[0-9]+);
+** mov\.u64 (%r[0-9]+), 16;
+** add\.u64 (%r[0-9]+), \3, -1;
+** add\.u64 (%r[0-9]+), \1, \4;
+** div\.u64 (%r[0-9]+), \5, 16;
+** mul\.lo\.u64 (%r[0-9]+), \6, 16;
+** {
+** \.reg\.u64 (%r[0-9]+)_local;
+** alloca\.u64 \8_local, \7;
+** cvta\.local\.u64 \8, \8_local;
+** }
+** ...
+*/
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do assemble } */
+/* { dg-options {-O1 -mno-soft-stack} } */
+/* { dg-add-options nvptx_alloca_ptx } */
+/* { dg-additional-options -save-temps } */
+/* { dg-final { check-function-bodies {** } {} } } */
+
+void sink(void *);
+
+void f(int s)
+{
+ char a[s];
+ sink(a);
+}
+/*
+** f:
+** \.visible \.func f \(\.param\.u32 %in_ar0\)
+** {
+** \.reg\.u32 %ar0;
+** ld\.param\.u32 %ar0, \[%in_ar0\];
+** \.reg\.u32 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** \.reg\.u64 (%r[0-9]+);
+** mov\.u32 \1, %ar0;
+** cvt\.s64\.s32 \2, \1;
+** add\.u64 \3, \2, 15;
+** and\.b64 \4, \3, -16;
+** {
+** \.reg\.u64 \5_local;
+** alloca\.u64 \5_local, \4;
+** cvta\.local\.u64 \5, \5_local;
+** }
+** {
+** \.param\.u64 %out_arg1;
+** st\.param\.u64 \[%out_arg1\], \5;
+** call sink, \(%out_arg1\);
+** }
+** ret;
+*/
similarity index 83%
rename from gcc/testsuite/gcc.target/nvptx/vla-1.c
rename to gcc/testsuite/gcc.target/nvptx/vla-1-sm_30.c
@@ -1,5 +1,6 @@
/* { dg-do compile } */
/* { dg-options -mno-soft-stack } */
+/* { dg-additional-options -march=sm_30 } */
void sink(void *);
@@ -1009,9 +1009,37 @@ proc check_effective_target_alloca {} {
return 0
}
if { [istarget nvptx-*-*] } {
+ # For nvptx, 'alloca' support depends on the configuration. In case
+ # of PTX "native" stacks, for 'dg-do run', it additionally depends on
+ # runtime support.
if { ![check_effective_target_nvptx_softstack] } {
- return 0
+ # '-mno-soft-stack': PTX "native" stacks
+
+ # Not supported unless '-mptx=7.3'+ and '-march=sm_52'+.
+ if { !([check_nvptx_default_ptx_isa_version_at_least 7 3]
+ && [check_nvptx_default_ptx_isa_target_architecture_at_least sm_52]) } {
+ return 0
+ }
+
+ # Find 'dg-do-what' in an outer frame.
+ set level 1
+ while true {
+ upvar $level dg-do-what dg-do-what
+ if [info exists dg-do-what] then break
+ incr level
+ }
+ verbose "check_effective_target_alloca: found dg-do-what at level $level" 2
+
+ if { [string equal [lindex ${dg-do-what} 0] run] } {
+ # For 'dg-do run', it additionally depends on runtime support.
+ # (If not supported, we don't try to demote 'run' to 'link',
+ # but instead simply fail the effective-target 'alloca' check.)
+ return [check_effective_target_nvptx_runtime_alloca_ptx]
+ } else {
+ return 1
+ }
} else {
+ # '-msoft-stack'
return 1
}
}
@@ -14100,6 +14128,35 @@ proc check_effective_target_nvptx_default_ptx_isa_version_at_least_6_0 { } {
return [check_nvptx_default_ptx_isa_version_at_least 6 0]
}
+# Return 1 if nvptx code by default compiles for at least the specified PTX ISA
+# target architecture.
+
+proc check_nvptx_default_ptx_isa_target_architecture_at_least { ta } {
+ set name nvptx_default_ptx_isa_target_architecture_at_least_${ta}
+
+ if [regexp {^sm_(\d+)$} $ta dummy ptx_sm] {
+ set ptx_sm "${ptx_sm}0"
+ } else {
+ error "check_nvptx_default_ptx_isa_target_architecture_at_least: illegal argument: $ta"
+ }
+
+ set supported_p \
+ [concat \
+ "(__PTX_SM__ >= $ptx_sm)"]
+
+ set src \
+ [list \
+ "#if $supported_p" \
+ "#else" \
+ "#error unsupported" \
+ "#endif"]
+ set src [join $src "\n"]
+
+ set res [check_no_compiler_messages $name assembly $src ""]
+
+ return $res
+}
+
# Return 1 if nvptx '-msoft-stack' is enabled.
proc check_effective_target_nvptx_softstack { } {
@@ -14132,6 +14189,28 @@ proc check_nvptx_runtime_ptx_isa_version_at_least { major minor } {
return $res
}
+# Return 1 if nvptx code with the specified PTX ISA target architecture or
+# higher can be run.
+
+proc check_nvptx_runtime_ptx_isa_target_architecture_at_least { ta } {
+ set name nvptx_runtime_ptx_isa_target_architecture_${ta}
+
+ set default \
+ [check_nvptx_default_ptx_isa_target_architecture_at_least ${ta}]
+
+ if { $default } {
+ set flag ""
+ } else {
+ set flag "-march=$ta -mptx=_"
+ }
+
+ set res [check_runtime $name {
+ int main (void) { return 0; }
+ } $flag]
+
+ return $res
+}
+
# Return 1 if the nvptx runtime environment supports the PTX ISA directive
# '.alias'.
@@ -14139,6 +14218,13 @@ proc check_effective_target_nvptx_runtime_alias_ptx { } {
return [check_nvptx_runtime_ptx_isa_version_at_least 6 3]
}
+# Return 1 if the nvptx runtime environment supports PTX 'alloca'.
+
+proc check_effective_target_nvptx_runtime_alloca_ptx { } {
+ return [expr { [check_nvptx_runtime_ptx_isa_version_at_least 7 3]
+ && [check_nvptx_runtime_ptx_isa_target_architecture_at_least sm_52] }]
+}
+
# Add options to enable nvptx using the PTX ISA directive '.alias'.
proc add_options_for_nvptx_alias_ptx { flags } {
@@ -14150,3 +14236,20 @@ proc add_options_for_nvptx_alias_ptx { flags } {
return $flags
}
+
+# Add options to enable nvptx using PTX 'alloca'.
+
+proc add_options_for_nvptx_alloca_ptx { flags } {
+ # We don't add '-mno-soft-stack' here; the users should take care of that
+ # explicitly.
+
+ if { ![check_nvptx_default_ptx_isa_version_at_least 7 3] } {
+ append flags " -mptx=7.3"
+ }
+
+ if { ![check_nvptx_default_ptx_isa_target_architecture_at_least sm_52] } {
+ append flags " -march=sm_52"
+ }
+
+ return $flags
+}
@@ -846,8 +846,8 @@ omp_get_device_from_uid_ (const char *uid, size_t uid_len)
/* Inside the target region, invoking this routine is undefined
behavior; thus, resolve it already here - instead of inside
libgomp/config/.../target.c.
- Note that on nvptx __builtin_alloca is defined, but fails with a sorry
- during compilation, as it is unsupported until isa 7.3 / sm_52. */
+ This also circumvents issues due to not all nvptx configurations
+ supporting 'alloca'. */
return omp_invalid_device;
#endif
}
@@ -1,12 +1,5 @@
! { dg-do run }
-! PR65181 "Support for alloca in nvptx"
-! { dg-excess-errors "lto1, mkoffload and lto-wrapper fatal errors" { target openacc_nvidia_accel_selected } }
-! Aside from restricting this testcase to non-nvptx offloading, and duplicating
-! it with 'dg-do link' for nvptx offloading, there doesn't seem to be a way to
-! XFAIL the "UNRESOLVED: [...] compilation failed to produce executable", or
-! get rid of it, unfortunately.
-
! { dg-additional-options "-fopt-info-note-omp" }
! { dg-additional-options "--param=openacc-privatization=noisy" }
! { dg-additional-options "-foffload=-fopt-info-note-omp" }
@@ -59,7 +52,6 @@ contains
! { dg-note {variable 'array' in 'private' clause is candidate for adjusting OpenACC privatization level} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'array' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'array' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop }
- ! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop }
do i = 1, 10
array(i) = i
end do
@@ -91,7 +83,6 @@ contains
! { dg-note {variable 'array\.[0-9]+' in 'private' clause is candidate for adjusting OpenACC privatization level} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'array\.[0-9]+' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'array\.[0-9]+' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop }
- ! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop }
do i = 1, 10
array(i) = 9*i
end do
@@ -117,7 +108,6 @@ contains
! { dg-note {variable 'str' ought to be adjusted for OpenACC privatization level: 'gang'} "" { target *-*-* } l_loop$c_loop }
! { dg-note {variable 'str' adjusted for OpenACC privatization level: 'gang'} "" { target { ! { openacc_host_selected || { openacc_nvidia_accel_selected && __OPTIMIZE__ } } } } l_loop$c_loop }
! { dg-note {variable 'char\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: artificial} "" { target *-*-* } l_loop$c_loop }
- ! { dg-message {sorry, unimplemented: target cannot support alloca} PR65181 { target openacc_nvidia_accel_selected } l_loop$c_loop }
do i = 1, 10
str(i:i) = achar(ichar('A') + i)
end do
--
2.34.1