@@ -75,9 +75,15 @@ extern unsigned int gcn_local_sym_hash (const char *name);
supported for gcn. */
#define GOMP_SELF_SPECS ""
+#define DRIVER_SELF_SPECS \
+ "%{march=fiji|march=gfx900|march=gfx906:%{!msram-ecc=*:-msram-ecc=off}}"
+
/* Use LLVM assembler and linker options. */
#define ASM_SPEC "-triple=amdgcn--amdhsa " \
"%:last_arg(%{march=*:-mcpu=%*}) " \
+ "-mattr=%{mxnack:+xnack;:-xnack} " \
+ /* FIXME: support "any" when we move to HSACOv4. */ \
+ "-mattr=%{!msram-ecc=off:+sram-ecc;:-sram-ecc} " \
"-filetype=obj"
#define LINK_SPEC "--pie --export-dynamic"
#define LIB_SPEC "-lc"
@@ -34,4 +34,11 @@ extern int gcn_isa;
#define TARGET_GCN5 (gcn_isa == 5)
#define TARGET_GCN5_PLUS (gcn_isa >= 5)
+enum sram_ecc_type  /* Values of the -msram-ecc= option, held in flag_sram_ecc.  */
+{
+ SRAM_ECC_OFF,  /* -msram-ecc=off.  Zero, so truth tests of flag_sram_ecc fail.  */
+ SRAM_ECC_ON,  /* -msram-ecc=on.  */
+ SRAM_ECC_ANY  /* -msram-ecc=any; the Init() default in the option record.  */
+};
+
#endif
@@ -703,6 +703,8 @@ (define_expand "vec_init<mode><scalar_mode>"
;; - The address space and glc (volatile) fields are there to replace the
;; fields normally found in a MEM.
;; - Multiple forms of address expression are supported, below.
+;;
+;; TODO: implement combined gather and zero_extend, but only for -msram-ecc=on
(define_expand "gather_load<mode><vnsi>"
[(match_operand:V_ALL 0 "register_operand")
@@ -144,6 +144,10 @@ gcn_option_override (void)
/* 1MB total. */
stack_size_opt = 1048576;
}
+
+ /* The xnack option is a placeholder, for now. */
+ if (flag_xnack)
+ sorry ("XNACK support");
}
/* }}} */
@@ -5182,11 +5186,16 @@ output_file_start (void)
case PROCESSOR_FIJI: cpu = "gfx803"; break;
case PROCESSOR_VEGA10: cpu = "gfx900"; break;
case PROCESSOR_VEGA20: cpu = "gfx906"; break;
- case PROCESSOR_GFX908: cpu = "gfx908+sram-ecc"; break;
+ case PROCESSOR_GFX908: cpu = "gfx908"; break;
default: gcc_unreachable ();
}
- fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s\"\n", cpu);
+ const char *xnack = (flag_xnack ? "+xnack" : "");
+ /* FIXME: support "any" when we move to HSACOv4. */
+ const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : "");
+
+ fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n",
+ cpu, xnack, sram_ecc);
}
/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
@@ -569,6 +569,7 @@ (define_insn "*mov<mode>_insn"
(set_attr "length" "4,4,8,12,12,12,12,4,8,8,12,12,8,12,12,8,12,12")])
; 8/16bit move pattern
+; TODO: implement combined load and zero_extend, but *only* for -msram-ecc=on
(define_insn "*mov<mode>_insn"
[(set (match_operand:QIHI 0 "nonimmediate_operand"
@@ -76,3 +76,24 @@ Target RejectNegative Joined UInteger Var(stack_size_opt) Init(-1)
Wopenacc-dims
Target Var(warn_openacc_dims) Warning
Warn about invalid OpenACC dimensions.
+
+mxnack
+Target Var(flag_xnack) Init(0)
+Compile for devices requiring XNACK enabled. Default off.
+
+Enum
+Name(sram_ecc_type) Type(enum sram_ecc_type)
+SRAM-ECC modes:
+
+EnumValue
+Enum(sram_ecc_type) String(off) Value(SRAM_ECC_OFF)
+
+EnumValue
+Enum(sram_ecc_type) String(on) Value(SRAM_ECC_ON)
+
+EnumValue
+Enum(sram_ecc_type) String(any) Value(SRAM_ECC_ANY)
+
+msram-ecc=
+Target RejectNegative Joined ToLower Enum(sram_ecc_type) Var(flag_sram_ecc) Init(SRAM_ECC_ANY)
+Compile for devices with the SRAM ECC feature enabled, or not. Default \"any\".
@@ -52,7 +52,10 @@
#undef EF_AMDGPU_MACH_AMDGCN_GFX906
#define EF_AMDGPU_MACH_AMDGCN_GFX906 0x2f
#undef EF_AMDGPU_MACH_AMDGCN_GFX908
-#define EF_AMDGPU_MACH_AMDGCN_GFX908 0x230 // Assume SRAM-ECC enabled.
+#define EF_AMDGPU_MACH_AMDGCN_GFX908 0x30
+
+#define EF_AMDGPU_XNACK 0x100
+#define EF_AMDGPU_SRAM_ECC 0x200
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE 0
@@ -77,6 +80,7 @@ static struct obstack files_to_cleanup;
enum offload_abi offload_abi = OFFLOAD_ABI_UNSET;
uint32_t elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX803; // Default GPU architecture.
+uint32_t elf_flags = 0;
/* Delete tempfiles. */
@@ -298,7 +302,7 @@ copy_early_debug_info (const char *infile, const char *outfile)
ehdr.e_ident[8] = ELFABIVERSION_AMDGPU_HSA;
ehdr.e_type = ET_REL;
ehdr.e_machine = EM_AMDGPU;
- ehdr.e_flags = elf_arch;
+ ehdr.e_flags = elf_arch | elf_flags;
/* Load the section headers so we can walk them later. */
Elf64_Shdr *sections = (Elf64_Shdr *)xmalloc (sizeof (Elf64_Shdr)
@@ -823,6 +827,7 @@ main (int argc, char **argv)
bool fopenacc = false;
bool fPIC = false;
bool fpic = false;
+ bool sram_seen = false;
for (int i = 1; i < argc; i++)
{
#define STR "-foffload-abi="
@@ -845,6 +850,26 @@ main (int argc, char **argv)
fPIC = true;
else if (strcmp (argv[i], "-fpic") == 0)
fpic = true;
+ else if (strcmp (argv[i], "-mxnack") == 0)
+ elf_flags |= EF_AMDGPU_XNACK;
+ else if (strcmp (argv[i], "-mno-xnack") == 0)
+ elf_flags &= ~EF_AMDGPU_XNACK;
+ else if (strcmp (argv[i], "-msram-ecc=on") == 0)
+ {
+ elf_flags |= EF_AMDGPU_SRAM_ECC;
+ sram_seen = true;
+ }
+ else if (strcmp (argv[i], "-msram-ecc=any") == 0)
+ {
+ /* FIXME: change this when we move to HSACOv4. */
+ elf_flags |= EF_AMDGPU_SRAM_ECC;
+ sram_seen = true;
+ }
+ else if (strcmp (argv[i], "-msram-ecc=off") == 0)
+ {
+ elf_flags &= ~EF_AMDGPU_SRAM_ECC;
+ sram_seen = true;
+ }
else if (strcmp (argv[i], "-save-temps") == 0)
save_temps = true;
else if (strcmp (argv[i], "-v") == 0)
@@ -865,6 +890,21 @@ main (int argc, char **argv)
if (!(fopenacc ^ fopenmp))
fatal_error (input_location, "either -fopenacc or -fopenmp must be set");
+ /* The SRAM-ECC feature defaults to "any" on GPUs where the feature is
+ available. */
+ if (!sram_seen)
+ switch (elf_arch)
+ {
+ case EF_AMDGPU_MACH_AMDGCN_GFX803:
+ case EF_AMDGPU_MACH_AMDGCN_GFX900:
+ case EF_AMDGPU_MACH_AMDGCN_GFX906:
+ break;
+ default:
+ /* FIXME: change this when we move to HSACOv4. */
+ elf_flags |= EF_AMDGPU_SRAM_ECC;
+ break;
+ }
+
const char *abi;
switch (offload_abi)
{
@@ -892,6 +932,12 @@ main (int argc, char **argv)
obstack_ptr_grow (&cc_argv_obstack, "-xlto");
if (fopenmp)
obstack_ptr_grow (&cc_argv_obstack, "-mgomp");
+ obstack_ptr_grow (&cc_argv_obstack,
+ (elf_flags & EF_AMDGPU_XNACK
+ ? "-mxnack" : "-mno-xnack"));
+ obstack_ptr_grow (&cc_argv_obstack,
+ (elf_flags & EF_AMDGPU_SRAM_ECC
+ ? "-msram-ecc=on" : "-msram-ecc=off"));
for (int ix = 1; ix != argc; ix++)
{
@@ -993,6 +1039,14 @@ main (int argc, char **argv)
}
obstack_ptr_grow (&ld_argv_obstack, gcn_s2_name);
obstack_ptr_grow (&ld_argv_obstack, "-lgomp");
+ obstack_ptr_grow (&ld_argv_obstack,
+ (elf_flags & EF_AMDGPU_XNACK
+ ? "-mxnack" : "-mno-xnack"));
+ obstack_ptr_grow (&ld_argv_obstack,
+ (elf_flags & EF_AMDGPU_SRAM_ECC
+ ? "-msram-ecc=on" : "-msram-ecc=off"));
+ if (verbose)
+ obstack_ptr_grow (&ld_argv_obstack, "-v");
for (int i = 1; i < argc; i++)
if (startswith (argv[i], "-l")
@@ -18847,6 +18847,15 @@ Compile for GCN5 Vega 20 devices (gfx906).
@end table
+@item -msram-ecc=on
+@itemx -msram-ecc=off
+@itemx -msram-ecc=any
+@opindex msram-ecc
+Compile binaries suitable for devices with the SRAM-ECC feature enabled,
+disabled, or either mode. This feature can be enabled per-process on some
+devices. The compiled code must match the device mode. The default is
+@samp{any}, for devices that support it.
+
@item -mstack-size=@var{bytes}
@opindex mstack-size
Specify how many @var{bytes} of stack space will be requested for each GPU
@@ -18855,6 +18864,14 @@ available. The size of the stack allocation may also have an impact on
run-time performance. The default is 32KB when using OpenACC or OpenMP, and
1MB otherwise.
+@item -mxnack
+@opindex mxnack
+Compile binaries suitable for devices with the XNACK feature enabled. Some
+devices always require XNACK and some allow the user to configure XNACK. The
+compiled code must match the device mode. The default is @samp{-mno-xnack}.
+At present this option is a placeholder for support that is not yet
+implemented.
+
@end table
@node ARC Options
new file mode 100644
@@ -0,0 +1,17 @@
+/* Ensure that explicit zero-extend instructions are present when compiling
+ for targets without sram-ecc enabled (in which sub-dword loads do not
+ zero the high bits of the target register). */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -msram-ecc=off" } */
+
+extern unsigned char c;
+
+unsigned int
+f ()
+{
+ return c;
+}
+
+/* { dg-final { scan-assembler "lshl.* 24" } } */
+/* { dg-final { scan-assembler "lshr.* 24" } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Ensure that explicit zero-extend instructions are present when compiling
+ for targets without sram-ecc enabled (in which sub-dword loads do not
+ zero the high bits of the target register). */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -msram-ecc=off" } */
+
+extern unsigned short s;
+
+unsigned short
+f ()
+{
+ return s;
+}
+
+/* { dg-final { scan-assembler "lshl.* 16" } } */
+/* { dg-final { scan-assembler "lshr.* 16" } } */
new file mode 100644
@@ -0,0 +1,21 @@
+/* Ensure that explicit zero-extend instructions are present when compiling
+ for targets without sram-ecc enabled (in which sub-dword loads do not
+ zero the high bits of the target register). */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -dp -msram-ecc=off" } */
+
+typedef unsigned int v64si __attribute__ ((vector_size (64*4)));
+typedef unsigned char v64qi __attribute__ ((vector_size (64*1)));
+
+extern v64si a;
+extern v64qi b;
+
+void
+f ()
+{
+ for (int n = 0; n < 64; n++)
+ a[n] = b[n];
+}
+
+/* { dg-final { scan-assembler "zero_extendv64qiv64si2" } } */
new file mode 100644
@@ -0,0 +1,21 @@
+/* Ensure that explicit zero-extend instructions are present when compiling
+ for targets without sram-ecc enabled (in which sub-dword loads do not
+ zero the high bits of the target register). */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -dp -msram-ecc=off" } */
+
+typedef unsigned int v64si __attribute__ ((vector_size (64*4)));
+typedef unsigned short v64hi __attribute__ ((vector_size (64*2)));
+
+extern v64si a;
+extern v64hi b;
+
+void
+f ()
+{
+ for (int n = 0; n < 64; n++)
+ a[n] = b[n];
+}
+
+/* { dg-final { scan-assembler "zero_extendv64hiv64si2" } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Ensure that explicit zero-extend instructions are present when compiling
+ for targets that may not have sram-ecc enabled (in which sub-dword loads do
+ not zero the high bits of the target register). */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -msram-ecc=any" } */
+
+extern unsigned char c;
+
+unsigned int
+f ()
+{
+ return c;
+}
+
+/* { dg-final { scan-assembler "lshl.* 24" } } */
+/* { dg-final { scan-assembler "lshr.* 24" } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* Ensure that explicit zero-extend instructions are present when compiling
+ for targets that may not have sram-ecc enabled (in which sub-dword loads do
+ not zero the high bits of the target register). */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -msram-ecc=any" } */
+
+extern unsigned short s;
+
+unsigned short
+f ()
+{
+ return s;
+}
+
+/* { dg-final { scan-assembler "lshl.* 16" } } */
+/* { dg-final { scan-assembler "lshr.* 16" } } */
new file mode 100644
@@ -0,0 +1,21 @@
+/* Ensure that explicit zero-extend instructions are present when compiling
+ for targets that may not have sram-ecc enabled (in which sub-dword loads do
+ not zero the high bits of the target register). */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -dp -msram-ecc=any" } */
+
+typedef unsigned int v64si __attribute__ ((vector_size (64*4)));
+typedef unsigned char v64qi __attribute__ ((vector_size (64*1)));
+
+extern v64si a;
+extern v64qi b;
+
+void
+f ()
+{
+ for (int n = 0; n < 64; n++)
+ a[n] = b[n];
+}
+
+/* { dg-final { scan-assembler "zero_extendv64qiv64si2" } } */
new file mode 100644
@@ -0,0 +1,21 @@
+/* Ensure that explicit zero-extend instructions are present when compiling
+ for targets that may not have sram-ecc enabled (in which sub-dword loads do
+ not zero the high bits of the target register). */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -dp -msram-ecc=any" } */
+
+typedef unsigned int v64si __attribute__ ((vector_size (64*4)));
+typedef unsigned short v64hi __attribute__ ((vector_size (64*2)));
+
+extern v64si a;
+extern v64hi b;
+
+void
+f ()
+{
+ for (int n = 0; n < 64; n++)
+ a[n] = b[n];
+}
+
+/* { dg-final { scan-assembler "zero_extendv64hiv64si2" } } */