new file mode 100644
@@ -0,0 +1,6161 @@
+/* Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+ This file is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your option)
+ any later version.
+
+ This file is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+/* {{{ Includes. */
+
+/* We want GET_MODE_SIZE et al to return integers, please. */
+#define IN_TARGET_CODE 1
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "memmodel.h"
+#include "rtl.h"
+#include "tree.h"
+#include "df.h"
+#include "tm_p.h"
+#include "stringpool.h"
+#include "optabs.h"
+#include "regs.h"
+#include "emit-rtl.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "insn-attr.h"
+#include "fold-const.h"
+#include "calls.h"
+#include "explow.h"
+#include "expr.h"
+#include "output.h"
+#include "cfgrtl.h"
+#include "langhooks.h"
+#include "builtins.h"
+#include "omp-general.h"
+#include "print-rtl.h"
+#include "attribs.h"
+#include "varasm.h"
+
+/* This file should be included last. */
+#include "target-def.h"
+
+/* }}} */
+/* {{{ Global variables. */
+
+/* Constants used by FP instructions.  These two values are not statically
+ initialized: init_ext_gcn_constants fills them in and then sets
+ ext_gcn_constants_init, which gcn_inline_fp_constant_p checks before
+ calling the initializer. */
+
+static REAL_VALUE_TYPE dconst4, dconst1over2pi;
+static bool ext_gcn_constants_init = 0;
+
+/* Holds the ISA variant, derived from the command line parameters.
+ gcn_option_override overwrites this with 5 when gcn_arch is
+ PROCESSOR_VEGA and with 3 otherwise. */
+
+int gcn_isa = 3; /* Default to GCN3. */
+
+/* Reserve this much space for LDS (for propagating variables from
+ worker-single mode to worker-partitioned mode), per workgroup. Global
+ analysis could calculate an exact bound, but we don't do that yet. */
+
+#define LDS_SIZE 32768
+
+/* }}} */
+/* {{{ Initialization and options. */
+
+/* Allocate and set up a fresh machine_function record.  Installed as the
+   init_machine_status hook by gcn_option_override.  */
+
+static struct machine_function *
+gcn_init_machine_status (void)
+{
+  struct machine_function *mf = ggc_cleared_alloc<machine_function> ();
+
+  /* LDS allocation state used for broadcasting within this function.  */
+  mf->lds_allocated = 32;
+  mf->lds_allocs = hash_map<tree, int>::create_ggc (64);
+
+  /* LDS temporary decls for worker reductions start out empty.  */
+  vec_alloc (mf->reduc_decls, 0);
+
+  /* The allocation above is zero-filled, so only the true case needs
+     storing.  */
+  if (TARGET_GCN3)
+    mf->use_flat_addressing = true;
+
+  return mf;
+}
+
+/* Implement TARGET_OPTION_OVERRIDE.
+
+   Adjust option values whose defaults depend on other settings, and force
+   the settings this target requires.  */
+
+static void
+gcn_option_override (void)
+{
+  init_machine_status = gcn_init_machine_status;
+
+  /* The HSA runtime does not respect ELF load addresses, so force PIE.  */
+  if (flag_pie == 0)
+    flag_pie = 2;
+  if (flag_pic == 0)
+    flag_pic = flag_pie;
+
+  /* Disable debug info, for now.  */
+  debug_info_level = DINFO_LEVEL_NONE;
+
+  if (gcn_arch == PROCESSOR_VEGA)
+    gcn_isa = 5;
+  else
+    gcn_isa = 3;
+
+  /* Anything other than -1 means the stack size was already chosen.  */
+  if (stack_size_opt != -1)
+    return;
+
+  /* The default stack size needs to be small for offload kernels because
+     there may be many, many threads.  But, a small stack is insufficient
+     for running the testsuite, so we use a larger default for the
+     stand alone case.  */
+  const bool offloading = (flag_openmp || flag_openacc);
+  if (offloading)
+    /* 1280 bytes per work item = 80kB total.  */
+    stack_size_opt = 1280 * 64;
+  else
+    /* 1MB total.  */
+    stack_size_opt = 1048576;
+}
+
+/* }}} */
+/* {{{ Attributes. */
+
+/* This table defines the arguments that are permitted in
+ __attribute__ ((amdgpu_hsa_kernel (...))).
+
+ The names and values correspond to the HSA metadata that is encoded
+ into the assembler file and binary.
+
+ The position of each entry is significant: the *_ARG macros interleaved
+ below are the array indices of the entries that follow them, and
+ gcn_parse_amdgpu_hsa_kernel_attribute uses those indices as bit numbers
+ in its "requested" mask. Keep the macros in step with the table when
+ adding or reordering entries. */
+
+static const struct gcn_kernel_arg_type
+{
+ /* Specifier string; the attribute parser matches it with strcmp. */
+ const char *name;
+ /* NULL for the fixed-register entries. NOTE(review): presumably the
+ name of the kernel header field that enables this argument - it is
+ not used in this part of the file; confirm against the users. */
+ const char *header_pseudo;
+ /* For dynamically allocated entries the parser reserves 1, 2 or 4
+ consecutive SGPRs for SImode, DImode or TImode respectively. */
+ machine_mode mode;
+
+ /* This should be set to -1 or -2 for a dynamically allocated register
+ number. Use -1 if this argument contributes to the user_sgpr_count,
+ -2 otherwise. */
+ int fixed_regno;
+} gcn_kernel_arg_types[] = {
+ /* Index 0. */
+ {"exec", NULL, DImode, EXEC_REG},
+#define PRIVATE_SEGMENT_BUFFER_ARG 1
+ {"private_segment_buffer",
+ "enable_sgpr_private_segment_buffer", TImode, -1},
+#define DISPATCH_PTR_ARG 2
+ {"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
+#define QUEUE_PTR_ARG 3
+ {"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
+#define KERNARG_SEGMENT_PTR_ARG 4
+ {"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
+ /* Index 5. */
+ {"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
+#define FLAT_SCRATCH_INIT_ARG 6
+ {"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
+#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
+ {"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
+ /* Indices 8, 9 and 10. */
+ {"grid_workgroup_count_X",
+ "enable_sgpr_grid_workgroup_count_x", SImode, -1},
+ {"grid_workgroup_count_Y",
+ "enable_sgpr_grid_workgroup_count_y", SImode, -1},
+ {"grid_workgroup_count_Z",
+ "enable_sgpr_grid_workgroup_count_z", SImode, -1},
+#define WORKGROUP_ID_X_ARG 11
+ {"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
+ /* Indices 12, 13 and 14. */
+ {"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
+ {"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
+ {"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
+#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
+ {"private_segment_wave_offset",
+ "enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
+#define WORK_ITEM_ID_X_ARG 16
+ {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
+#define WORK_ITEM_ID_Y_ARG 17
+ {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
+#define WORK_ITEM_ID_Z_ARG 18
+ {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
+};
+
+/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
+   This function also sets the default values for some arguments.
+
+   LIST is the chain of attribute arguments (it may be empty); every value
+   must be a STRING_CST naming an entry of gcn_kernel_arg_types.  ARGS
+   receives the mask of requested arguments, the order in which the user
+   listed them, and the register assigned to each requested argument.
+   ARGS is filled in even when a diagnostic is issued.
+
+   Return true if an error was diagnosed, false on success.  */
+
+static bool
+gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
+                                       tree list)
+{
+  bool err = false;
+
+  /* These arguments are requested whether or not the user names them.  */
+  args->requested = ((1 << PRIVATE_SEGMENT_BUFFER_ARG)
+                     | (1 << QUEUE_PTR_ARG)
+                     | (1 << KERNARG_SEGMENT_PTR_ARG)
+                     | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG));
+  args->nargs = 0;
+
+  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
+    args->reg[a] = -1;
+
+  for (; list; list = TREE_CHAIN (list))
+    {
+      if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
+        {
+          error ("amdgpu_hsa_kernel attribute requires string constant "
+                 "arguments");
+          /* Report the failure to the caller like the other diagnostics
+             in this loop do.  */
+          err = true;
+          break;
+        }
+
+      const char *str = TREE_STRING_POINTER (TREE_VALUE (list));
+
+      /* Look the specifier up in the table of known argument names.  */
+      int a;
+      for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
+        if (!strcmp (str, gcn_kernel_arg_types[a].name))
+          break;
+
+      if (a == GCN_KERNEL_ARG_TYPES)
+        {
+          error ("unknown specifier %s in amdgpu_hsa_kernel attribute", str);
+          err = true;
+          break;
+        }
+
+      /* This also rejects a specifier that is already requested by
+         default above.  */
+      if (args->requested & (1 << a))
+        {
+          error ("duplicated parameter specifier %s in amdgpu_hsa_kernel "
+                 "attribute", str);
+          err = true;
+          break;
+        }
+
+      args->requested |= (1 << a);
+      args->order[args->nargs++] = a;
+    }
+
+  args->requested |= (1 << WORKGROUP_ID_X_ARG);
+  args->requested |= (1 << WORK_ITEM_ID_Z_ARG);
+
+  /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
+     WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
+     requesting WORK_ITEM_ID_X_ARG.  */
+  if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
+    args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
+  if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
+    args->requested |= (1 << WORK_ITEM_ID_X_ARG);
+
+  /* Always enable this so that kernargs is in a predictable place for
+     gomp_print, etc.  */
+  args->requested |= (1 << DISPATCH_PTR_ARG);
+
+  /* Assign registers in table order: fixed entries take their fixed
+     register, the others take consecutive SGPRs.  */
+  int sgpr_regno = FIRST_SGPR_REG;
+  args->nsgprs = 0;
+  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
+    {
+      if (!(args->requested & (1 << a)))
+        continue;
+
+      if (gcn_kernel_arg_types[a].fixed_regno >= 0)
+        {
+          args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
+          continue;
+        }
+
+      int reg_count;
+      switch (gcn_kernel_arg_types[a].mode)
+        {
+        case E_SImode:
+          reg_count = 1;
+          break;
+        case E_DImode:
+          reg_count = 2;
+          break;
+        case E_TImode:
+          reg_count = 4;
+          break;
+        default:
+          gcc_unreachable ();
+        }
+
+      args->reg[a] = sgpr_regno;
+      sgpr_regno += reg_count;
+
+      /* Only entries marked -1 count towards nsgprs; -2 entries do not.  */
+      if (gcn_kernel_arg_types[a].fixed_regno == -1)
+        args->nsgprs += reg_count;
+    }
+
+  if (sgpr_regno > FIRST_SGPR_REG + 16)
+    {
+      error ("too many arguments passed in sgpr registers");
+      err = true;
+    }
+
+  return err;
+}
+
+/* Referenced by TARGET_ATTRIBUTE_TABLE.
+
+   Validate the amdgpu_hsa_kernel attribute NAME with arguments ARGS when
+   applied to *NODE.  Sets *NO_ADD_ATTRS when the attribute must be dropped:
+   either *NODE is not an acceptable kind of tree (a warning is issued) or
+   the argument list does not parse (errors are issued by the parser).
+   Always returns NULL_TREE.  */
+
+static tree
+gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
+                                        tree args, int, bool *no_add_attrs)
+{
+  if (TREE_CODE (*node) != FUNCTION_TYPE
+      && TREE_CODE (*node) != METHOD_TYPE
+      && TREE_CODE (*node) != FIELD_DECL
+      && TREE_CODE (*node) != TYPE_DECL)
+    {
+      warning (OPT_Wattributes, "%qE attribute only applies to functions",
+               name);
+      *no_add_attrs = true;
+      return NULL_TREE;
+    }
+
+  /* Check the argument list now so that a malformed attribute is dropped.
+     The name tested here must match the entry in gcn_attribute_table,
+     otherwise this validation never runs.  */
+  if (is_attribute_p ("amdgpu_hsa_kernel", name))
+    {
+      struct gcn_kernel_args kernelarg;
+
+      /* The parser returns true when it has reported an error.  */
+      if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
+        *no_add_attrs = true;
+    }
+
+  return NULL_TREE;
+}
+
+/* Implement TARGET_ATTRIBUTE_TABLE.
+
+ Create target-specific __attribute__ types. */
+
+static const struct attribute_spec gcn_attribute_table[] = {
+ /* Each initializer below has nine fields, with one boolean between
+ fn_type_req and the handler and a trailing pointer after it:
+ { name, min_len, max_len, decl_req, type_req, fn_type_req,
+ affects_type_identity, handler, exclude }
+ NOTE(review): field names taken from struct attribute_spec in
+ tree-core.h - confirm against the GCC version being built. */
+ {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
+ true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
+ /* End element. */
+ {NULL, 0, 0, false, false, false, false, NULL, NULL}
+};
+
+/* }}} */
+/* {{{ Registers and modes. */
+
+/* Implement TARGET_CLASS_MAX_NREGS.
+
+   Return how many hard registers of class RCLASS a value of MODE needs.  */
+
+static unsigned char
+gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
+{
+  /* Scalar registers are 32bit, so by default count 4-byte units.  */
+  unsigned char nregs = CEIL (GET_MODE_SIZE (mode), 4);
+
+  /* Vector registers are in fact tuples of 64 lanes, so the byte size of
+     the mode does not give the register count for them.  */
+  if (rclass == VGPR_REGS)
+    {
+      if (vgpr_1reg_mode_p (mode))
+        nregs = 1;
+      else if (vgpr_2reg_mode_p (mode))
+        nregs = 2;
+      else if (mode == TImode)
+        /* TImode is used by DImode compare_and_swap.  */
+        nregs = 4;
+    }
+
+  return nregs;
+}
+
+/* Implement TARGET_HARD_REGNO_NREGS.
+
+   Return how many consecutive hard registers, starting at REGNO, are needed
+   to hold a value of MODE.  The answer is the per-class count for the class
+   that REGNO belongs to.  */
+
+unsigned int
+gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
+{
+  reg_class_t rclass = REGNO_REG_CLASS (regno);
+  return gcn_class_max_nregs (rclass, mode);
+}
+
+/* Implement TARGET_HARD_REGNO_MODE_OK.
+
+   Return true if hard register REGNO may hold a value of MODE.  */
+
+bool
+gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
+{
+  /* A complex mode is judged as the scalar mode of the same overall size.
+     Complex modes not listed here are not supported.  */
+  if (COMPLEX_MODE_P (mode))
+    {
+      switch (mode)
+        {
+        case E_CQImode:
+        case E_CHImode:
+          mode = SImode;
+          break;
+        case E_CSImode:
+          mode = DImode;
+          break;
+        case E_CDImode:
+          mode = TImode;
+          break;
+        case E_HCmode:
+          mode = SFmode;
+          break;
+        case E_SCmode:
+          mode = DFmode;
+          break;
+        default:
+          return false;
+        }
+    }
+
+  const bool is_bi = (mode == BImode);
+  const bool is_si = (mode == SImode);
+  const bool is_di = (mode == DImode);
+
+  /* Special registers first.  */
+  switch (regno)
+    {
+    case FLAT_SCRATCH_LO_REG:
+    case XNACK_MASK_LO_REG:
+    case TBA_LO_REG:
+    case TMA_LO_REG:
+      return is_si || is_di;
+
+    case VCC_LO_REG:
+    case EXEC_LO_REG:
+      return is_bi || is_si || is_di;
+
+    case M0_REG:
+    case FLAT_SCRATCH_HI_REG:
+    case XNACK_MASK_HI_REG:
+    case TBA_HI_REG:
+    case TMA_HI_REG:
+    case EXEC_HI_REG:
+      return is_si;
+
+    case VCC_HI_REG:
+      return false;
+
+    case SCC_REG:
+    case VCCZ_REG:
+    case EXECZ_REG:
+      return is_bi;
+
+    default:
+      break;
+    }
+
+  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
+    return true;
+
+  if (SGPR_REGNO_P (regno))
+    {
+      /* We restrict double register values to aligned registers: two
+         register modes need an even index, TImode a multiple of four.  */
+      const unsigned int idx = regno - FIRST_SGPR_REG;
+
+      if (sgpr_1reg_mode_p (mode))
+        return true;
+      if ((idx & 1) == 0 && sgpr_2reg_mode_p (mode))
+        return true;
+      return (idx & 3) == 0 && mode == TImode;
+    }
+
+  if (VGPR_REGNO_P (regno))
+    {
+      if (vgpr_1reg_mode_p (mode) || vgpr_2reg_mode_p (mode))
+        return true;
+      /* TImode is used by DImode compare_and_swap.  */
+      return mode == TImode;
+    }
+
+  return false;
+}
+
+/* Implement REGNO_REG_CLASS via gcn.h.
+
+   Return the smallest register class that contains REGNO.  */
+
+enum reg_class
+gcn_regno_reg_class (int regno)
+{
+  /* Registers that have a class of their own.  */
+  if (regno == SCC_REG)
+    return SCC_CONDITIONAL_REG;
+  if (regno == VCCZ_REG)
+    return VCCZ_CONDITIONAL_REG;
+  if (regno == EXECZ_REG)
+    return EXECZ_CONDITIONAL_REG;
+  if (regno == EXEC_LO_REG || regno == EXEC_HI_REG)
+    return EXEC_MASK_REG;
+
+  /* The order of the remaining tests matters: the range tests come before
+     the catch-all "below the first VGPR" test.  */
+  if (VGPR_REGNO_P (regno))
+    return VGPR_REGS;
+  if (SGPR_REGNO_P (regno))
+    return SGPR_REGS;
+  if (regno < FIRST_VGPR_REG)
+    return GENERAL_REGS;
+  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
+    return AFP_REGS;
+
+  return ALL_REGS;
+}
+
+/* Implement TARGET_CAN_CHANGE_MODE_CLASS.
+
+   GCC assumes that lowpart contains first part of value as stored in memory.
+   This is not the case for vector registers, so when either mode is a VGPR
+   vector mode the change is allowed only if both modes need the same number
+   of registers in REGCLASS.  */
+
+bool
+gcn_can_change_mode_class (machine_mode from, machine_mode to,
+                           reg_class_t regclass)
+{
+  const bool involves_vector
+    = vgpr_vector_mode_p (from) || vgpr_vector_mode_p (to);
+
+  if (!involves_vector)
+    return true;
+
+  unsigned char nregs_from = gcn_class_max_nregs (regclass, from);
+  unsigned char nregs_to = gcn_class_max_nregs (regclass, to);
+  return nregs_from == nregs_to;
+}
+
+/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
+
+   When this hook returns true for MODE, the compiler allows registers
+   explicitly used in the rtl to be used as spill registers but prevents the
+   compiler from extending the lifetime of these registers.  */
+
+bool
+gcn_small_register_classes_for_mode_p (machine_mode mode)
+{
+  /* We allocate into exec and vcc regs.  Those make small register class.  */
+  if (mode == DImode)
+    return true;
+  return mode == SImode;
+}
+
+/* Implement TARGET_CLASS_LIKELY_SPILLED_P.
+
+   Return true if pseudos assigned to registers of class RCLASS would likely
+   be spilled because registers of RCLASS are needed for spill registers.
+   That is the case for the exec mask class and for any class that overlaps
+   the conditional registers.  */
+
+static bool
+gcn_class_likely_spilled_p (reg_class_t rclass)
+{
+  if (rclass == EXEC_MASK_REG)
+    return true;
+
+  return reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass);
+}
+
+/* Implement TARGET_MODES_TIEABLE_P.
+
+   Return true if a value of MODE1 is accessible in MODE2 without copying.
+   Both modes must be no wider than MAX_FIXED_MODE_SIZE bits.  */
+
+bool
+gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
+{
+  if (GET_MODE_BITSIZE (mode1) > MAX_FIXED_MODE_SIZE)
+    return false;
+
+  return GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE;
+}
+
+/* Implement TARGET_TRULY_NOOP_TRUNCATION.
+
+   Return true if it is safe to "convert" a value of INPREC bits to one of
+   OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
+   it as if it had only OUTPREC bits.  Here that holds only when the input
+   is at most 32 bits wide.  */
+
+bool
+gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
+{
+  if (inprec > 32)
+    return false;
+
+  return outprec <= inprec;
+}
+
+/* Return the N-th 32-bit part of OP, a value of MODE that occupies
+   multiple registers.
+
+   For modes of 256 bytes or more the part is a V64SImode value: register
+   REGNO + N for a hard register, an element-wise split for a CONST_VECTOR,
+   or an undefined vector for an UNSPEC_VECTOR placeholder.  For smaller
+   modes the part is SImode: register REGNO + N for an 8-byte register,
+   otherwise an undefined value or a subreg at byte offset N * 4.  */
+
+rtx
+gcn_operand_part (machine_mode mode, rtx op, int n)
+{
+  const unsigned int mode_bytes = GET_MODE_SIZE (mode);
+
+  if (mode_bytes < 256)
+    {
+      if (mode_bytes == 8 && REG_P (op))
+        {
+          gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
+          return gen_rtx_REG (SImode, REGNO (op) + n);
+        }
+
+      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
+        return gcn_gen_undef (SImode);
+
+      return simplify_gen_subreg (SImode, op, mode, n * 4);
+    }
+
+  /* From here on MODE is at least 256 bytes.  */
+  if (REG_P (op))
+    {
+      gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
+      return gen_rtx_REG (V64SImode, REGNO (op) + n);
+    }
+
+  if (GET_CODE (op) == CONST_VECTOR)
+    {
+      /* Take part N of every element and collect the results.  */
+      const int nunits = GET_MODE_NUNITS (mode);
+      rtvec parts = rtvec_alloc (nunits);
+
+      for (int lane = 0; lane < nunits; ++lane)
+        {
+          rtx elt = CONST_VECTOR_ELT (op, lane);
+          RTVEC_ELT (parts, lane)
+            = gcn_operand_part (GET_MODE_INNER (mode), elt, n);
+        }
+
+      return gen_rtx_CONST_VECTOR (V64SImode, parts);
+    }
+
+  if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
+    return gcn_gen_undef (V64SImode);
+
+  gcc_unreachable ();
+}
+
+/* Return the N-th DImode part of OP, a value of MODE: the subreg at byte
+   offset N * 8.  */
+
+rtx
+gcn_operand_doublepart (machine_mode mode, rtx op, int n)
+{
+  const int byte_offset = n * 8;
+  return simplify_gen_subreg (DImode, op, mode, byte_offset);
+}
+
+/* Return true if OP can be split into subregs or high/low parts.
+   This is always true for scalars, but not normally true for vectors.
+   However, for vectors in hardregs we can use the low and high registers.
+
+   The unnamed machine_mode parameter is ignored; the mode is taken from OP
+   itself.  A SUBREG is looked through to the register it wraps.  */
+
+bool
+gcn_can_split_p (machine_mode, rtx op)
+{
+  if (!vgpr_vector_mode_p (GET_MODE (op)))
+    return true;
+
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+  if (!REG_P (op))
+    return true;
+
+  /* Hard registers are numbered strictly below FIRST_PSEUDO_REGISTER, as
+     the rest of this file assumes; the register numbered exactly
+     FIRST_PSEUDO_REGISTER is not a hard register and must not be split.  */
+  return REGNO (op) < FIRST_PSEUDO_REGISTER;
+}
+
+/* Implement TARGET_SPILL_CLASS.
+
+   Return the class of registers that may be used instead of memory to spill
+   a pseudo of class C, or NO_REGS if that is not possible or not
+   profitable.  Only classes overlapping the conditional registers are
+   spilled to registers, and they go to SGPRs.  The mode is ignored.  */
+
+static reg_class_t
+gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
+{
+  const bool overlaps_conditional
+    = reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c);
+
+  return overlaps_conditional ? SGPR_REGS : NO_REGS;
+}
+
+/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
+
+   Given pseudo REGNO with allocno class CL and best class BEST_CL as
+   calculated by IRA, return the class to use.  Classes that contain both
+   vgpr and sgpr registers are avoided: CL is preferred, then BEST_CL, and
+   if both are mixed the choice falls back on the mode of the pseudo.  */
+
+static reg_class_t
+gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
+                                     reg_class_t best_cl)
+{
+  const bool cl_is_mixed
+    = (cl == ALL_REGS || cl == SRCDST_REGS || cl == ALL_GPR_REGS);
+  if (!cl_is_mixed)
+    return cl;
+
+  const bool best_is_mixed
+    = (best_cl == ALL_REGS || best_cl == SRCDST_REGS
+       || best_cl == ALL_GPR_REGS);
+  if (!best_is_mixed)
+    return best_cl;
+
+  if (vgpr_vector_mode_p (PSEUDO_REGNO_MODE (regno)))
+    return VGPR_REGS;
+
+  return GENERAL_REGS;
+}
+
+/* Allocate a fresh DImode pseudo register, emit an insn that sets it to
+   VAL, and return the pseudo.  */
+
+static rtx
+get_exec (int64_t val)
+{
+  rtx pseudo = gen_reg_rtx (DImode);
+  rtx mask = gen_int_mode (val, DImode);
+
+  emit_insn (gen_rtx_SET (pseudo, mask));
+  return pseudo;
+}
+
+/* Return the exec value used for scalar operations: the constant 1.  */
+
+rtx
+gcn_scalar_exec ()
+{
+  rtx scalar_mask = const1_rtx;
+  return scalar_mask;
+}
+
+/* Return a new pseudo that an emitted insn initializes to the scalar exec
+   value, 1.  */
+
+rtx
+gcn_scalar_exec_reg ()
+{
+  const int64_t scalar_mask = 1;
+  return get_exec (scalar_mask);
+}
+
+/* Return the exec value with every bit set: the constant -1.  */
+
+rtx
+gcn_full_exec ()
+{
+  rtx full_mask = constm1_rtx;
+  return full_mask;
+}
+
+/* Return a new pseudo that an emitted insn initializes to the full exec
+   value, -1.  */
+
+rtx
+gcn_full_exec_reg ()
+{
+  const int64_t full_mask = -1;
+  return get_exec (full_mask);
+}
+
+/* }}} */
+/* {{{ Immediate constants. */
+
+/* Set up dconst4 and dconst1over2pi, then mark them as initialized.  */
+
+static void
+init_ext_gcn_constants (void)
+{
+  /* FIXME: this constant probably does not match what hardware really loads.
+     Reality check it eventually.  */
+  real_from_string (&dconst1over2pi,
+                    "0.1591549430918953357663423455968866839");
+  /* The parsed value is rounded to SFmode in place.  */
+  real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
+
+  real_from_integer (&dconst4, DFmode, 4, SIGNED);
+
+  ext_gcn_constants_init = true;
+}
+
+/* Return non-zero if X is a constant that can appear as an inline operand.
+   This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi)
+   Or a vector of those.
+
+   For a scalar HFmode, SFmode or DFmode constant the value returned is the
+   encoding of the constant:
+     128 = 0.0, 240 = 0.5, 241 = -0.5, 242 = 1.0, 243 = -1.0,
+     244 = 2.0, 245 = -2.0, 246 = 4.0, 247 = -4.0, 248 = 1/(2*pi).
+   For a vector (only considered when ALLOW_VECTOR is true) all 64 elements
+   must be the same rtx and the first must itself be an inline constant;
+   the value returned is then 1, not an encoding.
+   Zero is returned for everything else.  */
+
+int
+gcn_inline_fp_constant_p (rtx x, bool allow_vector)
+{
+  machine_mode mode = GET_MODE (x);
+
+  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
+      && allow_vector)
+    {
+      if (GET_CODE (x) != CONST_VECTOR)
+        return 0;
+      if (!gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false))
+        return 0;
+      for (int i = 1; i < 64; i++)
+        if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
+          return 0;
+      return 1;
+    }
+
+  if (mode != HFmode && mode != SFmode && mode != DFmode)
+    return 0;
+
+  if (x == CONST0_RTX (mode))
+    return 128;
+  if (x == CONST1_RTX (mode))
+    return 242;
+
+  /* Only a CONST_DOUBLE carries a REAL_VALUE_TYPE to inspect.  */
+  if (GET_CODE (x) != CONST_DOUBLE)
+    return 0;
+
+  /* dconst4 and dconst1over2pi are set up lazily, so this has to happen
+     before the first comparison against them.  */
+  if (!ext_gcn_constants_init)
+    init_ext_gcn_constants ();
+
+  const REAL_VALUE_TYPE *r = CONST_DOUBLE_REAL_VALUE (x);
+
+  if (real_identical (r, &dconsthalf))
+    return 240;
+  if (real_identical (r, &dconstm1))
+    return 243;
+  if (real_identical (r, &dconst2))
+    return 244;
+  if (real_identical (r, &dconst4))
+    return 246;
+  /* FIXME: dconst1over2pi is rounded to SFmode when it is initialized;
+     check whether this can match HFmode and DFmode constants.  */
+  if (real_identical (r, &dconst1over2pi))
+    return 248;
+
+  /* real_value_negate returns the negated value and leaves its argument
+     untouched, so the negative constants are tested on the result.  */
+  const REAL_VALUE_TYPE neg = real_value_negate (r);
+
+  if (real_identical (&neg, &dconsthalf))
+    return 241;
+  if (real_identical (&neg, &dconst2))
+    return 245;
+  if (real_identical (&neg, &dconst4))
+    return 247;
+
+  return 0;
+}
+
+/* Return true if X is a floating-point constant that can appear as an
+   immediate operand.
+
+   A scalar HFmode, SFmode or DFmode constant qualifies if it is an inline
+   constant (see gcn_inline_fp_constant_p) or if its mode is not DFmode.
+   A V64HFmode, V64SFmode or V64DFmode CONST_VECTOR qualifies, when
+   ALLOW_VECTOR is true, if all 64 elements are the same rtx and the first
+   element qualifies as a scalar.  */
+
+bool
+gcn_fp_constant_p (rtx x, bool allow_vector)
+{
+  machine_mode mode = GET_MODE (x);
+  const bool fp_vector_mode
+    = (mode == V64HFmode || mode == V64SFmode || mode == V64DFmode);
+
+  if (fp_vector_mode && allow_vector)
+    {
+      if (GET_CODE (x) != CONST_VECTOR)
+        return false;
+
+      rtx first = CONST_VECTOR_ELT (x, 0);
+      if (!gcn_fp_constant_p (first, false))
+        return false;
+
+      for (int lane = 1; lane < 64; lane++)
+        if (CONST_VECTOR_ELT (x, lane) != first)
+          return false;
+
+      return true;
+    }
+
+  const bool fp_scalar_mode
+    = (mode == HFmode || mode == SFmode || mode == DFmode);
+  if (!fp_scalar_mode)
+    return false;
+
+  if (gcn_inline_fp_constant_p (x, false))
+    return true;
+
+  /* FIXME: It is not clear how 32bit immediates are interpreted here.  */
+  return mode != DFmode;
+}
+
+/* Return true if X is a constant representable as an inline immediate
+   constant in a 32-bit instruction encoding.
+
+   Integers qualify in the range -16 to 64, both ends included.
+   Floating-point values are checked by gcn_inline_fp_constant_p.  A
+   CONST_VECTOR qualifies if its mode is a VGPR vector mode or V64BImode,
+   all 64 elements are the same rtx, and the first element qualifies.  */
+
+bool
+gcn_inline_constant_p (rtx x)
+{
+  if (GET_CODE (x) == CONST_INT)
+    /* The hardware provides inline encodings for 0..64 and -1..-16.  */
+    return INTVAL (x) >= -16 && INTVAL (x) <= 64;
+
+  if (GET_CODE (x) == CONST_DOUBLE)
+    return gcn_inline_fp_constant_p (x, false);
+
+  if (GET_CODE (x) == CONST_VECTOR)
+    {
+      if (!vgpr_vector_mode_p (GET_MODE (x))
+          && GET_MODE (x) != V64BImode)
+        return false;
+      if (!gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0)))
+        return false;
+      for (int i = 1; i < 64; i++)
+        if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
+          return false;
+      return true;
+    }
+
+  return false;
+}
+
+/* Return true if X is a constant representable as an immediate constant
+   in a 32 or 64-bit instruction encoding.
+
+   Every CONST_INT, SYMBOL_REF and LABEL_REF qualifies.  A CONST_DOUBLE is
+   checked by gcn_fp_constant_p.  A CONST_VECTOR qualifies if its mode is a
+   VGPR vector mode or V64BImode, all 64 elements are the same rtx, and the
+   first element qualifies.  */
+
+bool
+gcn_constant_p (rtx x)
+{
+  const enum rtx_code code = GET_CODE (x);
+
+  if (code == CONST_INT || code == SYMBOL_REF || code == LABEL_REF)
+    return true;
+
+  if (code == CONST_DOUBLE)
+    return gcn_fp_constant_p (x, false);
+
+  if (code != CONST_VECTOR)
+    return false;
+
+  if (!vgpr_vector_mode_p (GET_MODE (x))
+      && GET_MODE (x) != V64BImode)
+    return false;
+
+  rtx first = CONST_VECTOR_ELT (x, 0);
+  if (!gcn_constant_p (first))
+    return false;
+
+  for (int lane = 1; lane < 64; lane++)
+    if (CONST_VECTOR_ELT (x, lane) != first)
+      return false;
+
+  return true;
+}
+
+/* Return true if X is a constant representable as two inline immediate
+   constants in a 64-bit instruction that is split into two 32-bit
+   instructions.
+
+   A CONST_INT qualifies if both of its 32-bit parts, taken as parts of a
+   DImode value, are inline constants.  A CONST_VECTOR qualifies if its mode
+   is a VGPR vector mode or V64BImode, all 64 elements are the same rtx, and
+   the first element qualifies.  Nothing else qualifies.  */
+
+bool
+gcn_inline_constant64_p (rtx x)
+{
+  if (GET_CODE (x) == CONST_VECTOR)
+    {
+      if (!vgpr_vector_mode_p (GET_MODE (x))
+          && GET_MODE (x) != V64BImode)
+        return false;
+      if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0)))
+        return false;
+      for (int i = 1; i < 64; i++)
+        if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
+          return false;
+
+      return true;
+    }
+
+  if (GET_CODE (x) != CONST_INT)
+    return false;
+
+  rtx val_lo = gcn_operand_part (DImode, x, 0);
+  rtx val_hi = gcn_operand_part (DImode, x, 1);
+  return gcn_inline_constant_p (val_lo) && gcn_inline_constant_p (val_hi);
+}
+
+/* Return true if X is a constant representable as an immediate constant
+   in a 32 or 64-bit instruction encoding where the hardware will
+   extend the immediate to 64-bits.
+
+   X must first satisfy gcn_constant_p.  Anything that is not a CONST_INT is
+   then accepted; integers are subject to the extra checks below.  */
+
+bool
+gcn_constant64_p (rtx x)
+{
+  if (!gcn_constant_p (x))
+    return false;
+
+  if (GET_CODE (x) != CONST_INT)
+    return true;
+
+  /* Negative numbers are only allowed if they can be encoded within src0,
+     because the 32-bit immediates do not get sign-extended.
+     Unsigned numbers must not be encodable as 32-bit -1..-16, because the
+     assembler will use a src0 inline immediate and that will get
+     sign-extended.  */
+  HOST_WIDE_INT val = INTVAL (x);
+  const bool positive_32bit = ((val & 0xffffffff) == val);
+  const bool aliases_neg_inline = ((val & 0xfffffff0) == 0xfffffff0);
+
+  if (positive_32bit && !aliases_neg_inline)
+    return true;
+
+  /* Otherwise it has to fit in src0.  */
+  return gcn_inline_constant_p (x);
+}
+
+/* Implement TARGET_LEGITIMATE_CONSTANT_P.
+
+   Return true if X is a legitimate constant for an immediate operand.  The
+   mode argument is ignored; the decision is that of gcn_constant_p.  */
+
+bool
+gcn_legitimate_constant_p (machine_mode, rtx x)
+{
+  const bool legitimate = gcn_constant_p (x);
+  return legitimate;
+}
+
+/* Return true if X is a CONST_VECTOR whose elements 1 to 63 are all the
+   same rtx as element 0.  */
+
+static bool
+single_cst_vector_p (rtx x)
+{
+  if (GET_CODE (x) != CONST_VECTOR)
+    return false;
+
+  int lane = 1;
+  while (lane < 64)
+    {
+      if (CONST_VECTOR_ELT (x, lane) != CONST_VECTOR_ELT (x, 0))
+        return false;
+      lane++;
+    }
+
+  return true;
+}
+
+/* Create a CONST_VECTOR of mode MODE in which every element is the integer
+   A, converted to the element mode of MODE.  */
+
+rtx
+gcn_vec_constant (machine_mode mode, int a)
+{
+  const int nunits = GET_MODE_NUNITS (mode);
+  rtvec elts = rtvec_alloc (nunits);
+
+  /* One shared rtx is stored in every lane.  */
+  rtx value = gen_int_mode (a, GET_MODE_INNER (mode));
+  for (int lane = 0; lane < nunits; ++lane)
+    RTVEC_ELT (elts, lane) = value;
+
+  return gen_rtx_CONST_VECTOR (mode, elts);
+}
+
+/* Create a CONST_VECTOR of mode MODE in which every element is the rtx A.
+   A is stored as given, without any conversion.  */
+
+rtx
+gcn_vec_constant (machine_mode mode, rtx a)
+{
+  const int nunits = GET_MODE_NUNITS (mode);
+  rtvec elts = rtvec_alloc (nunits);
+
+  int lane = 0;
+  while (lane < nunits)
+    {
+      RTVEC_ELT (elts, lane) = a;
+      ++lane;
+    }
+
+  return gen_rtx_CONST_VECTOR (mode, elts);
+}
+
+/* Create an undefined vector value, used where an insn operand is
+ optional. */
+
+rtx
+gcn_gen_undef (machine_mode mode)
+{
+ return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
+}
+
+/* }}} */
+/* {{{ Addresses, pointers and moves. */
+
+/* Return true if REG is a valid place to store a pointer,
+   for instructions that require an SGPR.
+   FIXME rename.
+
+   REG may be wrapped in a SUBREG, which is looked through.  The register
+   must have mode MODE.  A pseudo is accepted unconditionally when STRICT is
+   false; when STRICT is true it is judged by the hard register it has been
+   assigned in reg_renumber, and rejected if it has none.  Accepted hard
+   registers are those numbered below 102, M0, and the arg and frame
+   pointers.  */
+
+static bool
+gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
+{
+  if (GET_CODE (reg) == SUBREG)
+    reg = SUBREG_REG (reg);
+
+  if (!REG_P (reg))
+    return false;
+
+  if (GET_MODE (reg) != mode)
+    return false;
+
+  int regno = REGNO (reg);
+
+  if (regno >= FIRST_PSEUDO_REGISTER)
+    {
+      if (!strict)
+        return true;
+
+      if (!reg_renumber)
+        return false;
+
+      regno = reg_renumber[regno];
+
+      /* A negative entry means the pseudo was not given a hard register;
+         without this test it would pass the "below 102" test below.  */
+      if (regno < 0)
+        return false;
+    }
+
+  return (regno < 102 || regno == M0_REG
+          || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
+}
+
+/* Return true if REG is a valid place to store a pointer,
+   for instructions that require a VGPR.
+
+   REG may be wrapped in a SUBREG, which is looked through.  The register
+   must have mode MODE.  A pseudo is accepted unconditionally when STRICT is
+   false; when STRICT is true it is judged by its reg_renumber entry.  */
+
+static bool
+gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
+{
+  rtx inner = (GET_CODE (reg) == SUBREG) ? SUBREG_REG (reg) : reg;
+
+  if (!REG_P (inner) || GET_MODE (inner) != mode)
+    return false;
+
+  int regno = REGNO (inner);
+
+  if (regno < FIRST_PSEUDO_REGISTER)
+    return VGPR_REGNO_P (regno);
+
+  /* REGNO is a pseudo from here on.  */
+  if (!strict)
+    return true;
+  if (!reg_renumber)
+    return false;
+
+  regno = reg_renumber[regno];
+  return VGPR_REGNO_P (regno);
+}
+
+/* Return true if X would be valid inside a MEM of mode MODE using the Flat
+   address space.
+
+   For an integer or float vector MODE, X may be a DImode register accepted
+   by gcn_address_register_p; for any other MODE, a DImode register accepted
+   by gcn_vec_address_register_p.  On GCN5 and later, the sum of a DImode
+   register accepted by gcn_vec_address_register_p and a CONST_INT is also
+   accepted, whatever MODE is.  All register checks are non-strict.  */
+
+bool
+gcn_flat_address_p (rtx x, machine_mode mode)
+{
+  const enum mode_class mclass = GET_MODE_CLASS (mode);
+  const bool vec_mode
+    = (mclass == MODE_VECTOR_INT || mclass == MODE_VECTOR_FLOAT);
+
+  if (vec_mode)
+    {
+      if (gcn_address_register_p (x, DImode, false))
+        return true;
+    }
+  else if (gcn_vec_address_register_p (x, DImode, false))
+    return true;
+
+  return (TARGET_GCN5_PLUS
+          && GET_CODE (x) == PLUS
+          && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
+          && CONST_INT_P (XEXP (x, 1)));
+}
+
+/* Return true if X would be valid inside a MEM using the Scalar Flat
+   address space: a DImode register accepted by gcn_address_register_p
+   (non-strict), optionally plus a CONST_INT offset.  */
+
+bool
+gcn_scalar_flat_address_p (rtx x)
+{
+  if (gcn_address_register_p (x, DImode, false))
+    return true;
+
+  if (GET_CODE (x) != PLUS)
+    return false;
+
+  rtx base = XEXP (x, 0);
+  return (gcn_address_register_p (base, DImode, false)
+          && CONST_INT_P (XEXP (x, 1)));
+}
+
+/* Return true if X is a MEM that would be valid for the Scalar Flat address
+   space: its mode must be at least 4 bytes wide and its address must
+   satisfy gcn_scalar_flat_address_p.  */
+
+bool
+gcn_scalar_flat_mem_p (rtx x)
+{
+  if (!MEM_P (x) || GET_MODE_SIZE (GET_MODE (x)) < 4)
+    return false;
+
+  rtx addr = XEXP (x, 0);
+  return gcn_scalar_flat_address_p (addr);
+}
+
+/* Return true if X would be valid inside a MEM using the LDS or GDS
+   address spaces: an SImode register accepted by
+   gcn_vec_address_register_p (non-strict), optionally plus a CONST_INT
+   offset.  */
+
+bool
+gcn_ds_address_p (rtx x)
+{
+  if (gcn_vec_address_register_p (x, SImode, false))
+    return true;
+
+  if (GET_CODE (x) != PLUS)
+    return false;
+
+  rtx base = XEXP (x, 0);
+  return (gcn_vec_address_register_p (base, SImode, false)
+          && CONST_INT_P (XEXP (x, 1)));
+}
+
+/* Return true if ADDR would be valid inside a MEM using the Global
+   address space.
+
+   Accepted forms, with all register checks non-strict:
+     - a DImode SGPR or VGPR address register;
+     - such a register plus a CONST_INT in the range -4096 to 4095;
+     - a DImode SGPR address register plus an SImode VGPR;
+     - (SGPR + VGPR) plus a CONST_INT in the range -4096 to 4095.  */
+
+bool
+gcn_global_address_p (rtx addr)
+{
+  if (gcn_address_register_p (addr, DImode, false)
+      || gcn_vec_address_register_p (addr, DImode, false))
+    return true;
+
+  if (GET_CODE (addr) != PLUS)
+    return false;
+
+  rtx base = XEXP (addr, 0);
+  rtx offset = XEXP (addr, 1);
+
+  const bool immediate_p = (CONST_INT_P (offset)
+                            && INTVAL (offset) >= -(1 << 12)
+                            && INTVAL (offset) < (1 << 12));
+
+  /* SGPR + CONST or VGPR + CONST */
+  if ((gcn_address_register_p (base, DImode, false)
+       || gcn_vec_address_register_p (base, DImode, false))
+      && immediate_p)
+    return true;
+
+  /* SGPR + VGPR */
+  if (gcn_address_register_p (base, DImode, false)
+      && gcn_vgpr_register_operand (offset, SImode))
+    return true;
+
+  /* (SGPR + VGPR) + CONST */
+  if (GET_CODE (base) == PLUS
+      && gcn_address_register_p (XEXP (base, 0), DImode, false)
+      && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
+      && immediate_p)
+    return true;
+
+  return false;
+}
+
+/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
+
+ Recognizes RTL expressions that are valid memory addresses for an
+ instruction. The MODE argument is the machine mode for the MEM
+ expression that wants to use this address.
+
+ It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should
+ convert common non-canonical forms to canonical form so that they will
+ be recognized. */
+
+/* Summary of the address forms accepted below, by address space:
+
+     scalar flat         REG, or REG + CONST_INT with 0 <= offset < 2^20 and
+                         offset a multiple of 4.
+     scratch             REG only.
+     flat, flat scratch  REG; when not TARGET_GCN3 also REG + CONST_INT with
+                         0 <= offset < 2^12 and offset a multiple of 4.
+     global              REG; REG + CONST_INT with -2^12 <= offset < 2^12
+                         and offset a multiple of 4; and, for non-vector
+                         modes only, SGPR + VGPR and (SGPR + VGPR) + CONST_INT.
+     LDS, GDS            REG, or REG + uniform CONST_VECTOR of CONST_INT
+                         with 0 <= element < 2^20.
+
+   Which register predicate is applied to the base depends on AS and on
+   whether MODE is a vector mode.  STRICT is passed through unchanged to
+   the register predicates.  Returns true if X is a legitimate address for
+   a MODE access in address space AS, false otherwise.  */
+
+static bool
+gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
+                                     addr_space_t as)
+{
+  /* All vector instructions need to work on addresses in registers.  */
+  if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
+    return false;
+
+  if (AS_SCALAR_FLAT_P (as))
+    switch (GET_CODE (x))
+      {
+      case REG:
+        return gcn_address_register_p (x, DImode, strict);
+      /* Addresses are in the form BASE+OFFSET
+         OFFSET is either 20bit unsigned immediate, SGPR or M0.
+         Writes and atomics do not accept SGPR.  */
+      case PLUS:
+        {
+          rtx x0 = XEXP (x, 0);
+          rtx x1 = XEXP (x, 1);
+          if (!gcn_address_register_p (x0, DImode, strict))
+            return false;
+          /* FIXME: This is disabled because of the mode mismatch between
+             SImode (for the address or m0 register) and the DImode PLUS.
+             We'll need a zero_extend or similar.
+
+          if (gcn_m0_register_p (x1, SImode, strict)
+              || gcn_address_register_p (x1, SImode, strict))
+            return true;
+          else*/
+          if (GET_CODE (x1) == CONST_INT)
+            {
+              if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
+                  /* The low bits of the offset are ignored, even when
+                     they're meant to realign the pointer.  */
+                  && !(INTVAL (x1) & 0x3))
+                return true;
+            }
+          return false;
+        }
+
+      default:
+        break;
+      }
+  else if (AS_SCRATCH_P (as))
+    return gcn_address_register_p (x, SImode, strict);
+  else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
+    {
+      if (TARGET_GCN3 || GET_CODE (x) == REG)
+        return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+                 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+                ? gcn_address_register_p (x, DImode, strict)
+                : gcn_vec_address_register_p (x, DImode, strict));
+      else
+        {
+          gcc_assert (TARGET_GCN5_PLUS);
+
+          if (GET_CODE (x) == PLUS)
+            {
+              rtx x0 = XEXP (x, 0);
+              rtx x1 = XEXP (x, 1);
+
+              /* Validate the base operand of the PLUS.  Handing the PLUS
+                 itself to a register predicate, as the other address
+                 spaces avoid doing, would reject every BASE+OFFSET
+                 address.  */
+              if (VECTOR_MODE_P (mode)
+                  ? !gcn_address_register_p (x0, DImode, strict)
+                  : !gcn_vec_address_register_p (x0, DImode, strict))
+                return false;
+
+              if (GET_CODE (x1) == CONST_INT)
+                {
+                  if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
+                      /* The low bits of the offset are ignored, even when
+                         they're meant to realign the pointer.  */
+                      && !(INTVAL (x1) & 0x3))
+                    return true;
+                }
+            }
+          return false;
+        }
+    }
+  else if (AS_GLOBAL_P (as))
+    {
+      gcc_assert (TARGET_GCN5_PLUS);
+
+      if (GET_CODE (x) == REG)
+        return (gcn_address_register_p (x, DImode, strict)
+                || (!VECTOR_MODE_P (mode)
+                    && gcn_vec_address_register_p (x, DImode, strict)));
+      else if (GET_CODE (x) == PLUS)
+        {
+          rtx base = XEXP (x, 0);
+          rtx offset = XEXP (x, 1);
+
+          bool immediate_p = (GET_CODE (offset) == CONST_INT
+                              /* Signed 13-bit immediate.  */
+                              && INTVAL (offset) >= -(1 << 12)
+                              && INTVAL (offset) < (1 << 12)
+                              /* The low bits of the offset are ignored, even
+                                 when they're meant to realign the pointer.  */
+                              && !(INTVAL (offset) & 0x3));
+
+          if (!VECTOR_MODE_P (mode))
+            {
+              if ((gcn_address_register_p (base, DImode, strict)
+                   || gcn_vec_address_register_p (base, DImode, strict))
+                  && immediate_p)
+                /* SGPR + CONST or VGPR + CONST  */
+                return true;
+
+              if (gcn_address_register_p (base, DImode, strict)
+                  && gcn_vgpr_register_operand (offset, SImode))
+                /* SGPR + VGPR  */
+                return true;
+
+              if (GET_CODE (base) == PLUS
+                  && gcn_address_register_p (XEXP (base, 0), DImode, strict)
+                  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
+                  && immediate_p)
+                /* (SGPR + VGPR) + CONST  */
+                return true;
+            }
+          else
+            {
+              if (gcn_address_register_p (base, DImode, strict)
+                  && immediate_p)
+                /* SGPR + CONST  */
+                return true;
+            }
+          /* Any other PLUS falls through to the final "return false".  */
+        }
+      else
+        return false;
+    }
+  else if (AS_ANY_DS_P (as))
+    switch (GET_CODE (x))
+      {
+      case REG:
+        return (VECTOR_MODE_P (mode)
+                ? gcn_address_register_p (x, SImode, strict)
+                : gcn_vec_address_register_p (x, SImode, strict));
+      /* Addresses are in the form BASE+OFFSET.  The only offset accepted
+         here is a constant vector whose elements are all the same
+         non-negative integer below 2^20.  */
+      case PLUS:
+        {
+          rtx x0 = XEXP (x, 0);
+          rtx x1 = XEXP (x, 1);
+          /* NOTE(review): the bare REG case above checks the base in
+             SImode, but the base of a PLUS is checked in DImode here --
+             confirm which is intended.  */
+          if (!gcn_vec_address_register_p (x0, DImode, strict))
+            return false;
+          if (GET_CODE (x1) == REG)
+            {
+              /* NOTE(review): a register offset is never accepted; control
+                 reaches "return false" below whether or not this test
+                 fires.  The "<=" against FIRST_PSEUDO_REGISTER also looks
+                 like it was meant to be "<" -- confirm before enabling
+                 register offsets.  */
+              if (REGNO (x1) <= FIRST_PSEUDO_REGISTER
+                  && !gcn_ssrc_register_operand (x1, DImode))
+                return false;
+            }
+          else if (GET_CODE (x1) == CONST_VECTOR
+                   && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
+                   && single_cst_vector_p (x1))
+            {
+              x1 = CONST_VECTOR_ELT (x1, 0);
+              if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
+                return true;
+            }
+          return false;
+        }
+
+      default:
+        break;
+      }
+  else
+    gcc_unreachable ();
+  return false;
+}
+
+/* Implement TARGET_ADDR_SPACE_POINTER_MODE.
+
+   Return the appropriate mode for a named address pointer: SImode for the
+   scratch, LDS and GDS spaces and DImode for the default, flat,
+   flat-scratch, scalar-flat and global spaces.  ADDR_SPACE_GLOBAL is
+   listed explicitly because the other address-space switches in this
+   file handle it, and its addresses are validated as DImode.  Any other
+   value of ADDRSPACE is a bug.  */
+
+static scalar_int_mode
+gcn_addr_space_pointer_mode (addr_space_t addrspace)
+{
+  switch (addrspace)
+    {
+    case ADDR_SPACE_SCRATCH:
+    case ADDR_SPACE_LDS:
+    case ADDR_SPACE_GDS:
+      return SImode;
+    case ADDR_SPACE_DEFAULT:
+    case ADDR_SPACE_FLAT:
+    case ADDR_SPACE_FLAT_SCRATCH:
+    case ADDR_SPACE_SCALAR_FLAT:
+    case ADDR_SPACE_GLOBAL:
+      return DImode;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
+
+   Return the mode used for addresses in ADDRSPACE.  This is always the
+   same mode as the one used for pointers into that address space.  */
+
+static scalar_int_mode
+gcn_addr_space_address_mode (addr_space_t addrspace)
+{
+  scalar_int_mode as_mode = gcn_addr_space_pointer_mode (addrspace);
+
+  return as_mode;
+}
+
+/* Implement TARGET_ADDR_SPACE_SUBSET_P.
+
+   Return true if address space SUBSET is contained in SUPERSET.  Every
+   address space is a subset of itself, and the flat and scalar-flat
+   spaces are treated as supersets of everything.  */
+
+static bool
+gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
+{
+  /* FIXME is this true (for the flat and scalar-flat supersets)?  */
+  return (subset == superset
+          || AS_FLAT_P (superset)
+          || AS_SCALAR_FLAT_P (superset));
+}
+
+/* Convert pointer OP from the address space of FROM_TYPE's target to the
+   address space of TO_TYPE's target.  Both types must be pointer types.
+
+   Only two cases are supported: identical address spaces, where OP is
+   returned unchanged, and LDS to flat, where a fresh DImode pseudo is
+   built whose low word is OP and whose high word is loaded from offset 64
+   of the queue pointer held in the QUEUE_PTR_ARG register.  Anything else
+   is a bug.  */
+
+static rtx
+gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
+{
+  gcc_assert (POINTER_TYPE_P (from_type));
+  gcc_assert (POINTER_TYPE_P (to_type));
+
+  addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
+  addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
+
+  const bool lds_to_flat = AS_LDS_P (as_from) && AS_FLAT_P (as_to);
+  if (!lds_to_flat)
+    {
+      if (as_from == as_to)
+        return op;
+      gcc_unreachable ();
+    }
+
+  /* The high half of the flat address comes from the queue structure.  */
+  rtx queue_ptr = gen_rtx_REG (DImode,
+                               cfun->machine->args.reg[QUEUE_PTR_ARG]);
+  rtx aperture_addr = gen_rtx_PLUS (DImode, queue_ptr,
+                                    gen_int_mode (64, SImode));
+  rtx aperture_hi = gen_rtx_MEM (SImode, aperture_addr);
+  rtx flat_ptr = gen_reg_rtx (DImode);
+
+  emit_move_insn (gen_lowpart (SImode, flat_ptr), op);
+  emit_move_insn (gen_highpart_mode (SImode, DImode, flat_ptr), aperture_hi);
+
+  return flat_ptr;
+}
+
+
+/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h
+
+   Return true if REGNO is OK for memory addressing in address space AS.
+   A pseudo is always acceptable while reg_renumber is not set; otherwise
+   it is judged by the register reg_renumber maps it to.  The mode and the
+   two code arguments are ignored.  */
+
+bool
+gcn_regno_mode_code_ok_for_base_p (int regno,
+                                   machine_mode, addr_space_t as, int, int)
+{
+  if (regno >= FIRST_PSEUDO_REGISTER)
+    {
+      if (!reg_renumber)
+        return true;
+      regno = reg_renumber[regno];
+    }
+
+  /* The argument and frame pointers are accepted wherever any register
+     is accepted.  */
+  const bool pointer_reg_p = (regno == ARG_POINTER_REGNUM
+                              || regno == FRAME_POINTER_REGNUM);
+
+  if (AS_FLAT_P (as))
+    return VGPR_REGNO_P (regno) || pointer_reg_p;
+
+  if (AS_SCALAR_FLAT_P (as))
+    return SGPR_REGNO_P (regno) || pointer_reg_p;
+
+  if (AS_GLOBAL_P (as))
+    return SGPR_REGNO_P (regno) || VGPR_REGNO_P (regno) || pointer_reg_p;
+
+  /* For now.  */
+  return false;
+}
+
+/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
+
+   Return a suitable register class for the base register of a MODE access
+   in address space AS.  ADDR_SPACE_DEFAULT is resolved to
+   DEFAULT_ADDR_SPACE first.  OC and IC are only forwarded on that
+   recursive call.  */
+
+reg_class
+gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
+                              int ic)
+{
+  const bool vector_p = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+                         || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
+
+  switch (as)
+    {
+    case ADDR_SPACE_DEFAULT:
+      return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
+
+    case ADDR_SPACE_SCALAR_FLAT:
+    case ADDR_SPACE_SCRATCH:
+      return SGPR_REGS;
+
+    case ADDR_SPACE_FLAT:
+    case ADDR_SPACE_FLAT_SCRATCH:
+    case ADDR_SPACE_LDS:
+    case ADDR_SPACE_GDS:
+      if (vector_p)
+        return SGPR_REGS;
+      return VGPR_REGS;
+
+    case ADDR_SPACE_GLOBAL:
+      if (vector_p)
+        return SGPR_REGS;
+      return ALL_GPR_REGS;
+    }
+  gcc_unreachable ();
+}
+
+/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
+
+   Return true if REGNO is OK as the index of a memory address: M0 or any
+   VGPR.  A pseudo is always acceptable while reg_renumber is not set;
+   otherwise it is judged by the register reg_renumber maps it to.  */
+
+bool
+regno_ok_for_index_p (int regno)
+{
+  if (regno >= FIRST_PSEUDO_REGISTER)
+    {
+      if (!reg_renumber)
+        return true;
+      regno = reg_renumber[regno];
+    }
+
+  if (regno == M0_REG)
+    return true;
+  return VGPR_REGNO_P (regno);
+}
+
+/* Build (but do not emit) a move of OP1 into OP0 that uses the exec flags.
+
+   For VGPR vector modes the result is a PARALLEL of a VEC_MERGE set and a
+   clobber of a V64DImode scratch.  If EXEC is NULL or the DImode all-ones
+   constant, a full exec register is used and inactive lanes default to an
+   undefined value; otherwise EXEC is used as given and inactive lanes
+   default to the old contents of OP0.  A non-NULL PREV overrides the
+   default for inactive lanes in both cases.
+
+   For all other modes the result is a PARALLEL of a plain set and a USE
+   of EXEC, or of the scalar exec value when EXEC is NULL; PREV is
+   ignored.  */
+
+static rtx
+gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
+{
+  machine_mode mode = GET_MODE (op0);
+
+  if (!vgpr_vector_mode_p (mode))
+    {
+      rtx move = gen_rtx_SET (op0, op1);
+      rtx use = gen_rtx_USE (VOIDmode, exec ? exec : gcn_scalar_exec ());
+
+      return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, move, use));
+    }
+
+  const bool partial_p = exec && exec != CONSTM1_RTX (DImode);
+
+  if (!prev)
+    prev = partial_p ? op0 : gcn_gen_undef (mode);
+  if (!partial_p)
+    exec = gcn_full_exec_reg ();
+
+  rtx merge = gen_rtx_VEC_MERGE (mode, op1, prev, exec);
+  rtx move = gen_rtx_SET (op0, merge);
+  rtx clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (V64DImode));
+
+  return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, move, clobber));
+}
+
+/* Build (but do not emit) a masked broadcast: a SET of vector OP0 to a
+   VEC_MERGE that takes a VEC_DUPLICATE of OP1 where EXEC selects it and
+   OP2 elsewhere.  The vector mode is taken from OP0.  */
+
+static rtx
+gen_masked_scalar_load (rtx op0, rtx op1, rtx op2, rtx exec)
+{
+  rtx broadcast = gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1);
+  rtx merged = gen_rtx_VEC_MERGE (GET_MODE (op0), broadcast, op2, exec);
+
+  return gen_rtx_SET (op0, merged);
+}
+
+/* Expand vector init of OP0 by VEC.
+   Implements vec_init instruction pattern.
+
+   VEC supplies one element per lane; 64 lanes are assumed.  The value of
+   lane 0 is first moved into OP0 with a full exec mask.  Each remaining
+   distinct value is then written by one masked move covering every lane
+   that shares it, so the number of moves emitted equals the number of
+   distinct element values.  Non-constant values are forced into a
+   register of the vector's inner mode first.
+
+   Lane masks are built in an unsigned 64-bit type so that setting the bit
+   for lane 63 never shifts into a sign bit; they are converted to int64_t
+   only when handed to get_exec.  */
+
+void
+gcn_expand_vector_init (rtx op0, rtx vec)
+{
+  uint64_t initialized_mask = 0;
+  uint64_t curr_mask = 1;
+  machine_mode mode = GET_MODE (op0);
+
+  rtx val = XVECEXP (vec, 0, 0);
+
+  /* Collect every lane that shares lane 0's value.  */
+  for (int i = 1; i < 64; i++)
+    if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
+      curr_mask |= (uint64_t) 1 << i;
+
+  /* The first move is unmasked; lanes holding other values are
+     overwritten by the masked moves below.  */
+  if (gcn_constant_p (val))
+    emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val)));
+  else
+    {
+      val = force_reg (GET_MODE_INNER (mode), val);
+      emit_insn (gen_masked_scalar_load (op0, val, gcn_gen_undef (mode),
+                                         gcn_full_exec_reg ()));
+    }
+  initialized_mask |= curr_mask;
+
+  for (int i = 1; i < 64; i++)
+    {
+      if (initialized_mask & ((uint64_t) 1 << i))
+        continue;
+
+      curr_mask = (uint64_t) 1 << i;
+      rtx val = XVECEXP (vec, 0, i);
+
+      /* Collect the later lanes that share lane I's value.  */
+      for (int j = i + 1; j < 64; j++)
+        if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
+          curr_mask |= (uint64_t) 1 << j;
+
+      if (gcn_constant_p (val))
+        emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
+                                      get_exec ((int64_t) curr_mask)));
+      else
+        {
+          val = force_reg (GET_MODE_INNER (mode), val);
+          emit_insn (gen_masked_scalar_load (op0, val, op0,
+                                             get_exec ((int64_t) curr_mask)));
+        }
+      initialized_mask |= curr_mask;
+    }
+}
+
+/* Load vector constant where n-th lane contains BASE+n*VAL.
+
+   A fresh MODE pseudo is first set to BASE in every lane.  Six masked
+   additions follow, one per bit of the 6-bit lane number, from the most
+   significant bit down: the addition for bit K adds VAL * 2^K to exactly
+   those lanes whose number has bit K set.  The additions use the V64SI
+   add pattern whatever MODE is.  Returns the pseudo.  */
+
+static rtx
+strided_constant (machine_mode mode, int base, int val)
+{
+  /* Exec masks selecting the lanes whose number has bit 5, 4, ... 0 set.  */
+  static const uint64_t lane_bit_masks[6] = {
+    0xffffffff00000000,
+    0xffff0000ffff0000,
+    0xff00ff00ff00ff00,
+    0xf0f0f0f0f0f0f0f0,
+    0xcccccccccccccccc,
+    0xaaaaaaaaaaaaaaaa
+  };
+
+  rtx x = gen_reg_rtx (mode);
+  emit_insn (gen_mov_with_exec (x, gcn_vec_constant (mode, base)));
+
+  int weight = 32;
+  for (int k = 0; k < 6; k++)
+    {
+      emit_insn (gen_addv64si3_vector (x, x,
+                                       gcn_vec_constant (mode, val * weight),
+                                       get_exec (lane_bit_masks[k]), x));
+      weight /= 2;
+    }
+
+  return x;
+}
+
+/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS.
+
+   Return a legitimized form of address X for a MODE access in address
+   space AS, emitting whatever instructions that requires.  OLD is only
+   forwarded when ADDR_SPACE_DEFAULT is resolved to DEFAULT_ADDR_SPACE.
+
+   Scalar-flat and scratch addresses are forced into a register for VGPR
+   vector modes.  Flat, flat-scratch and global addresses are forced into
+   a DImode register on GCN3 only.  LDS and GDS addresses used with a VGPR
+   vector mode are expanded into a V64SImode register holding X plus a
+   per-lane offset of the lane number times the unit size of MODE, unless
+   X is already V64SImode.  In all other cases X is returned unchanged.  */
+
+static rtx
+gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
+                                   addr_space_t as)
+{
+  switch (as)
+    {
+    case ADDR_SPACE_DEFAULT:
+      return gcn_addr_space_legitimize_address (x, old, mode,
+                                                DEFAULT_ADDR_SPACE);
+
+    case ADDR_SPACE_SCALAR_FLAT:
+    case ADDR_SPACE_SCRATCH:
+      /* Instructions working on vectors need the address to be in
+         a register.  */
+      return vgpr_vector_mode_p (mode) ? force_reg (GET_MODE (x), x) : x;
+
+    case ADDR_SPACE_FLAT:
+    case ADDR_SPACE_FLAT_SCRATCH:
+    case ADDR_SPACE_GLOBAL:
+      if (TARGET_GCN3)
+        return force_reg (DImode, x);
+      return x;
+
+    case ADDR_SPACE_LDS:
+    case ADDR_SPACE_GDS:
+      {
+        /* FIXME: LDS support offsets, handle them!.  */
+        if (!vgpr_vector_mode_p (mode) || GET_MODE (x) == V64SImode)
+          return x;
+
+        rtx full_exec = gcn_full_exec_reg ();
+        rtx lane_addrs = gen_reg_rtx (V64SImode);
+        rtx scalar_base = force_reg (SImode, x);
+        rtx lane_offsets = strided_constant (V64SImode, 0,
+                                             GET_MODE_UNIT_SIZE (mode));
+
+        /* lane_addrs[:] = scalar_base  */
+        emit_insn (gen_vec_duplicatev64si_exec
+                   (lane_addrs, scalar_base, full_exec,
+                    gcn_gen_undef (V64SImode)));
+
+        /* lane_addrs[:] += lane_offsets[:]  */
+        emit_insn (gen_addv64si3_vector (lane_addrs, lane_offsets,
+                                         lane_addrs, full_exec,
+                                         gcn_gen_undef (V64SImode)));
+        return lane_addrs;
+      }
+    }
+  gcc_unreachable ();
+}
+
+/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
+ proper vector of stepped addresses.
+
+ MODE is the vector mode of the access. The per-lane step is the size of
+ its inner mode, applied as a left shift of the lane ramp; the result of
+ exact_log2 is used unchecked, so the size is presumably always a power
+ of two.
+ EXEC is the execution mask for the emitted vector instructions, or NULL,
+ in which case the unmasked instruction variants are used where the code
+ below provides them.
+ MEM will be a DImode address of a vector in an SGPR.
+ TMP will be a V64DImode VGPR pair or (scratch:V64DI).
+
+ Before GCN5 the address must be a plain register. Otherwise a
+ BASE+INDEX address is split and INDEX is re-applied to the result.
+
+ Instructions computing the per-lane addresses are emitted as a side
+ effect. The value returned is the new address expression, not a MEM:
+ a PLUS of the per-lane base and a VEC_DUPLICATE of the original index
+ (or of const0_rtx when the address had no index). For the flat address
+ space the base is a V64DImode register, for LDS/GDS it is a V64SImode
+ register, and for every other address space it is an unevaluated
+ (plus (vec_duplicate base) (sign_extend ramp)) expression.
+
+ The lane ramp is read from VGPR 1, which is presumably set up elsewhere
+ to hold each lane's number -- TODO confirm.
+
+ NOTE(review): in the flat case with a non-register TMP, EXEC is passed
+ to gen_addv64di3_zext_dup2 unchanged, even when it is NULL -- confirm the
+ pattern accepts that.
+ NOTE(review): the LDS/GDS case builds its all-ones mask with
+ gen_rtx_CONST_INT rather than using a shared constant -- confirm this is
+ intentional. */
+
+rtx
+gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
+ rtx tmp)
+{
+ gcc_assert (MEM_P (mem));
+ rtx mem_base = XEXP (mem, 0);
+ rtx mem_index = NULL_RTX;
+
+ if (!TARGET_GCN5_PLUS)
+ {
+ /* gcn_addr_space_legitimize_address should have put the address in a
+ register. If not, it is too late to do anything about it. */
+ gcc_assert (REG_P (mem_base));
+ }
+
+ if (GET_CODE (mem_base) == PLUS)
+ {
+ mem_index = XEXP (mem_base, 1);
+ mem_base = XEXP (mem_base, 0);
+ }
+
+ /* RF and RM base registers for vector modes should be always an SGPR. */
+ gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
+ || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
+
+ machine_mode inner = GET_MODE_INNER (mode);
+ int shift = exact_log2 (GET_MODE_SIZE (inner));
+ rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+ rtx undef_v64si = gcn_gen_undef (V64SImode);
+ rtx new_base = NULL_RTX;
+ addr_space_t as = MEM_ADDR_SPACE (mem);
+
+ rtx tmplo = (REG_P (tmp)
+ ? gcn_operand_part (V64DImode, tmp, 0)
+ : gen_reg_rtx (V64SImode));
+
+ /* tmplo[:] = ramp[:] << shift */
+ if (exec)
+ emit_insn (gen_ashlv64si3_vector (tmplo, ramp,
+ gen_int_mode (shift, SImode),
+ exec, undef_v64si));
+ else
+ emit_insn (gen_ashlv64si3_full (tmplo, ramp,
+ gen_int_mode (shift, SImode)));
+
+ if (AS_FLAT_P (as))
+ {
+ if (REG_P (tmp))
+ {
+ rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
+ rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
+ rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
+ rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
+
+ /* tmphi[:] = mem_base_hi */
+ if (exec)
+ emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi, exec,
+ undef_v64si));
+ else
+ emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
+
+ /* tmp[:] += zext (mem_base) */
+ if (exec)
+ {
+ rtx undef_di = gcn_gen_undef (DImode);
+ emit_insn (gen_addv64si3_vector_vcc_dup (tmplo, tmplo, mem_base_lo,
+ exec, undef_v64si, vcc,
+ undef_di));
+ emit_insn (gen_addcv64si3_vec (tmphi, tmphi, const0_rtx, exec,
+ undef_v64si, vcc, vcc,
+ gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0),
+ undef_di));
+ }
+ else
+ emit_insn (gen_addv64di3_scalarsi (tmp, tmp, mem_base_lo));
+ }
+ else
+ {
+ tmp = gen_reg_rtx (V64DImode);
+ emit_insn (gen_addv64di3_zext_dup2 (tmp, tmplo, mem_base, exec,
+ gcn_gen_undef (V64DImode)));
+ }
+
+ new_base = tmp;
+ }
+ else if (AS_ANY_DS_P (as))
+ {
+ if (!exec)
+ exec = gen_rtx_CONST_INT (VOIDmode, -1);
+
+ emit_insn (gen_addv64si3_vector_dup (tmplo, tmplo, mem_base, exec,
+ gcn_gen_undef (V64SImode)));
+ new_base = tmplo;
+ }
+ else
+ {
+ mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
+ new_base = gen_rtx_PLUS (V64DImode, mem_base,
+ gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
+ }
+
+ return gen_rtx_PLUS (GET_MODE (new_base), new_base,
+ gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
+ (mem_index ? mem_index
+ : const0_rtx)));
+}
+
+/* Return true if a move between OP0 and OP1 is known to be executed in
+   the vector unit: either operand is a VGPR hard register, or OP0 has a
+   VGPR vector mode.
+
+   NOTE(review): a MEM in the scalar-flat address space on either side
+   also yields true here, exactly as it does in gcn_sgpr_move_p.  That
+   looks like it may have been copied from the scalar variant -- confirm
+   against the machine description before relying on it.  */
+
+bool
+gcn_vgpr_move_p (rtx op0, rtx op1)
+{
+  if ((MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
+      || (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1))))
+    return true;
+
+  if (REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
+    return true;
+
+  if (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
+    return true;
+
+  return vgpr_vector_mode_p (GET_MODE (op0));
+}
+
+/* Return true if a move between OP0 and OP1 is known to be executed in
+   the scalar unit.  Used in the machine description.
+
+   That is the case when either operand is a MEM in the scalar-flat
+   address space, or when OP0 is a non-VGPR hard register and OP1 is a
+   non-VGPR hard register, an immediate or a memory operand.  */
+
+bool
+gcn_sgpr_move_p (rtx op0, rtx op1)
+{
+  if ((MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
+      || (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1))))
+    return true;
+
+  const bool dest_scalar_hard_reg = (REG_P (op0)
+                                     && REGNO (op0) < FIRST_PSEUDO_REGISTER
+                                     && !VGPR_REGNO_P (REGNO (op0)));
+  if (!dest_scalar_hard_reg)
+    return false;
+
+  const bool src_scalar_hard_reg = (REG_P (op1)
+                                    && REGNO (op1) < FIRST_PSEUDO_REGISTER
+                                    && !VGPR_REGNO_P (REGNO (op1)));
+  if (src_scalar_hard_reg)
+    return true;
+
+  return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
+}
+
+/* Implement TARGET_SECONDARY_RELOAD.
+
+ The address space determines which registers can be used for loads and
+ stores.
+
+ X is the value being reloaded, RCLASS the register class on the other
+ side of the copy (the dump below prints "X -> RCLASS" when IN_P and
+ "X <- RCLASS" otherwise) and RELOAD_MODE the mode of the copy. The
+ return value is the class of intermediate register required, or NO_REGS
+ if none is. SRI->icode is always written: CODE_FOR_nothing unless a
+ special reload pattern is selected.
+
+ Only MEMs and spilled pseudos (a REG or SUBREG for which true_regnum
+ returns -1) get special treatment. A spilled pseudo is assumed to live
+ in DEFAULT_ADDR_SPACE, and ADDR_SPACE_DEFAULT on a MEM is resolved to
+ DEFAULT_ADDR_SPACE as well. Then:
+
+ - scalar flat: SGPR_REGS is required, unless X is not a MEM or RCLASS
+ is already SGPR_REGS;
+ - flat, flat scratch and global with a vector RELOAD_MODE: no
+ intermediate class is returned, but SRI->icode is set to the
+ reload_in<mode> or reload_out<mode> pattern for that mode (any vector
+ mode other than the seven V64 modes listed is a bug);
+ - the same address spaces with a non-vector mode, and LDS, GDS and
+ scratch: VGPR_REGS is required, unless RCLASS is already VGPR_REGS.
+
+ The query and its answer are written to the dump file when detailed
+ dumping is enabled. */
+
+static reg_class_t
+gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
+ machine_mode reload_mode, secondary_reload_info *sri)
+{
+ reg_class_t result = NO_REGS;
+ bool spilled_pseudo =
+ (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "gcn_secondary_reload: ");
+ dump_value_slim (dump_file, x, 1);
+ fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
+ reg_class_names[rclass], GET_MODE_NAME (reload_mode));
+ if (REG_P (x) || GET_CODE (x) == SUBREG)
+ fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
+ (true_regnum (x) >= 0
+ && true_regnum (x) < FIRST_PSEUDO_REGISTER
+ ? reg_names[true_regnum (x)]
+ : (spilled_pseudo ? "stack spill" : "??")));
+ fprintf (dump_file, "\n");
+ }
+
+ /* Some callers don't use or initialize icode. */
+ sri->icode = CODE_FOR_nothing;
+
+ if (MEM_P (x) || spilled_pseudo)
+ {
+ addr_space_t as = DEFAULT_ADDR_SPACE;
+
+ /* If we have a spilled pseudo, we can't find the address space
+ directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
+ ADDR_SPACE_GLOBAL for GCN5. */
+ if (MEM_P (x))
+ as = MEM_ADDR_SPACE (x);
+
+ if (as == ADDR_SPACE_DEFAULT)
+ as = DEFAULT_ADDR_SPACE;
+
+ switch (as)
+ {
+ case ADDR_SPACE_SCALAR_FLAT:
+ result =
+ ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
+ break;
+ case ADDR_SPACE_FLAT:
+ case ADDR_SPACE_FLAT_SCRATCH:
+ case ADDR_SPACE_GLOBAL:
+ if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
+ || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
+ {
+ if (in_p)
+ switch (reload_mode)
+ {
+ case E_V64SImode:
+ sri->icode = CODE_FOR_reload_inv64si;
+ break;
+ case E_V64SFmode:
+ sri->icode = CODE_FOR_reload_inv64sf;
+ break;
+ case E_V64HImode:
+ sri->icode = CODE_FOR_reload_inv64hi;
+ break;
+ case E_V64HFmode:
+ sri->icode = CODE_FOR_reload_inv64hf;
+ break;
+ case E_V64QImode:
+ sri->icode = CODE_FOR_reload_inv64qi;
+ break;
+ case E_V64DImode:
+ sri->icode = CODE_FOR_reload_inv64di;
+ break;
+ case E_V64DFmode:
+ sri->icode = CODE_FOR_reload_inv64df;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ else
+ switch (reload_mode)
+ {
+ case E_V64SImode:
+ sri->icode = CODE_FOR_reload_outv64si;
+ break;
+ case E_V64SFmode:
+ sri->icode = CODE_FOR_reload_outv64sf;
+ break;
+ case E_V64HImode:
+ sri->icode = CODE_FOR_reload_outv64hi;
+ break;
+ case E_V64HFmode:
+ sri->icode = CODE_FOR_reload_outv64hf;
+ break;
+ case E_V64QImode:
+ sri->icode = CODE_FOR_reload_outv64qi;
+ break;
+ case E_V64DImode:
+ sri->icode = CODE_FOR_reload_outv64di;
+ break;
+ case E_V64DFmode:
+ sri->icode = CODE_FOR_reload_outv64df;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+ }
+ /* Fallthrough. */
+ case ADDR_SPACE_LDS:
+ case ADDR_SPACE_GDS:
+ case ADDR_SPACE_SCRATCH:
+ result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
+ break;
+ }
+ }
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
+ get_insn_name (sri->icode));
+
+ return result;
+}
+
+/* Update register usage after having seen the compiler flags and kernel
+   attributes.  We typically want to fix registers that contain values
+   set by the HSA runtime.
+
+   Outside a function, or in a normal (non-kernel) function, the bottom
+   16 SGPRs and bottom 3 VGPRs are fixed.  In a kernel, each runtime
+   argument listed below that has been assigned a register (a
+   non-negative entry in args.reg) has its registers fixed, and on GCN5+
+   v0 is fixed as well.
+
+   NOTE(review): DISPATCH_PTR_ARG is fixed here even though it is
+   described as not needed after the prologue, while QUEUE_PTR_ARG, which
+   gcn_addr_space_convert reads, is not fixed -- confirm both are
+   intended.  */
+
+static void
+gcn_conditional_register_usage (void)
+{
+  int i;
+
+  /* Runtime arguments whose registers must stay untouched, with the
+     number of consecutive registers each one occupies.  */
+  static const struct
+  {
+    int arg;
+    int nregs;
+  } runtime_args[] = {
+    { PRIVATE_SEGMENT_WAVE_OFFSET_ARG, 1 },
+    { PRIVATE_SEGMENT_BUFFER_ARG, 4 },
+    { KERNARG_SEGMENT_PTR_ARG, 2 },
+    { DISPATCH_PTR_ARG, 2 },
+    { WORKGROUP_ID_X_ARG, 1 },
+    { WORK_ITEM_ID_X_ARG, 1 },
+    { WORK_ITEM_ID_Y_ARG, 1 },
+    { WORK_ITEM_ID_Z_ARG, 1 }
+  };
+
+  /* FIXME: Do we need to reset fixed_regs?  */
+
+  if (!cfun || !cfun->machine || cfun->machine->normal_function)
+    {
+      /* Normal functions can't know what kernel argument registers are
+         live, so just fix the bottom 16 SGPRs, and bottom 3 VGPRs.  */
+      for (i = 0; i < 16; i++)
+        fixed_regs[FIRST_SGPR_REG + i] = 1;
+      for (i = 0; i < 3; i++)
+        fixed_regs[FIRST_VGPR_REG + i] = 1;
+      return;
+    }
+
+  /* Fix the runtime argument register containing values that may be
+     needed later.  FLAT_SCRATCH_* should not be needed after the prologue
+     so there's no need to fix them.  */
+  for (unsigned k = 0; k < sizeof (runtime_args) / sizeof (runtime_args[0]);
+       k++)
+    {
+      int first = cfun->machine->args.reg[runtime_args[k].arg];
+
+      if (first < 0)
+        continue;
+      for (i = 0; i < runtime_args[k].nregs; i++)
+        fixed_regs[first + i] = 1;
+    }
+
+  if (TARGET_GCN5_PLUS)
+    /* v0 is always zero, for global nul-offsets.  */
+    fixed_regs[VGPR_REGNO (0)] = 1;
+}
+
+/* Determine if a load or store is valid, according to the register classes
+   and address space.  Used primarily by the machine description to decide
+   when to split a move into two steps.
+
+   A move that touches no memory is always valid.  Otherwise the move is
+   valid only if the memory operand's address is acceptable for its
+   address space and the other operand satisfies the register predicate
+   that goes with that space: VGPR for flat, global and LDS/GDS; ssrc for a
+   scalar-flat store and sdst for a scalar-flat load.  A bare SYMBOL_REF
+   or LABEL_REF is also accepted as an address everywhere except
+   LDS/GDS.  */
+
+bool
+gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
+{
+  const bool dest_mem = MEM_P (dest);
+  const bool src_mem = MEM_P (src);
+
+  if (!dest_mem && !src_mem)
+    return true;
+
+  rtx dest_addr = dest_mem ? XEXP (dest, 0) : NULL_RTX;
+  rtx src_addr = src_mem ? XEXP (src, 0) : NULL_RTX;
+
+  const bool dest_symbolic = (dest_mem
+                              && (GET_CODE (dest_addr) == SYMBOL_REF
+                                  || GET_CODE (dest_addr) == LABEL_REF));
+  const bool src_symbolic = (src_mem
+                             && (GET_CODE (src_addr) == SYMBOL_REF
+                                 || GET_CODE (src_addr) == LABEL_REF));
+
+  /* Flat: store, then load.  */
+  if (dest_mem
+      && AS_FLAT_P (MEM_ADDR_SPACE (dest))
+      && (dest_symbolic || gcn_flat_address_p (dest_addr, mode))
+      && gcn_vgpr_register_operand (src, mode))
+    return true;
+  if (src_mem
+      && AS_FLAT_P (MEM_ADDR_SPACE (src))
+      && (src_symbolic || gcn_flat_address_p (src_addr, mode))
+      && gcn_vgpr_register_operand (dest, mode))
+    return true;
+
+  /* Global: store, then load.  */
+  if (dest_mem
+      && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
+      && (dest_symbolic || gcn_global_address_p (dest_addr))
+      && gcn_vgpr_register_operand (src, mode))
+    return true;
+  if (src_mem
+      && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
+      && (src_symbolic || gcn_global_address_p (src_addr))
+      && gcn_vgpr_register_operand (dest, mode))
+    return true;
+
+  /* Scalar flat: store, then load.  */
+  if (dest_mem
+      && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
+      && (dest_symbolic || gcn_scalar_flat_address_p (dest_addr))
+      && gcn_ssrc_register_operand (src, mode))
+    return true;
+  if (src_mem
+      && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
+      && (src_symbolic || gcn_scalar_flat_address_p (src_addr))
+      && gcn_sdst_register_operand (dest, mode))
+    return true;
+
+  /* LDS and GDS: store, then load.  No symbolic addresses here.  */
+  if (dest_mem
+      && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
+      && gcn_ds_address_p (dest_addr)
+      && gcn_vgpr_register_operand (src, mode))
+    return true;
+  if (src_mem
+      && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
+      && gcn_ds_address_p (src_addr)
+      && gcn_vgpr_register_operand (dest, mode))
+    return true;
+
+  return false;
+}
+
+/* }}} */
+/* {{{ Functions and ABI. */
+
+/* Implement TARGET_FUNCTION_VALUE.
+
+   Return the register in which a value of type VALTYPE is returned.  The
+   register is always SGPR_REGNO (RETURN_VALUE_REG); only its mode varies.
+   Integral types whose integer mode is narrower than 4 bytes are returned
+   in SImode, everything else in the type's own mode.  */
+
+static rtx
+gcn_function_value (const_tree valtype, const_tree, bool)
+{
+  machine_mode mode = TYPE_MODE (valtype);
+
+  const bool narrow_int_p = (INTEGRAL_TYPE_P (valtype)
+                             && GET_MODE_CLASS (mode) == MODE_INT
+                             && GET_MODE_SIZE (mode) < 4);
+  if (narrow_int_p)
+    mode = SImode;
+
+  return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
+}
+
+/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
+
+   Return true if N is a possible register number for the function return
+   value.  The register is named the same way gcn_function_value names
+   it, via SGPR_REGNO, so that the two hooks cannot disagree.  */
+
+static bool
+gcn_function_value_regno_p (const unsigned int n)
+{
+  return n == (unsigned int) SGPR_REGNO (RETURN_VALUE_REG);
+}
+
+/* Calculate the number of registers required to hold a function argument
+   of MODE and TYPE.  The answer is 0 for arguments that must be passed on
+   the stack.  Otherwise it is the argument's size in bytes, taken from
+   TYPE for BLKmode arguments and from MODE for all others, rounded up to
+   whole words.  */
+
+static int
+num_arg_regs (machine_mode mode, const_tree type)
+{
+  if (targetm.calls.must_pass_in_stack (mode, type))
+    return 0;
+
+  int size;
+  if (!type || mode != BLKmode)
+    size = GET_MODE_SIZE (mode);
+  else
+    size = int_size_in_bytes (type);
+
+  const int rounded = size + UNITS_PER_WORD - 1;
+  return rounded / UNITS_PER_WORD;
+}
+
+/* Implement TARGET_STRICT_ARGUMENT_NAMING.
+
+   Return true if the location where a function argument is passed
+   depends on whether or not it is a named argument.
+
+   For gcn, we know how to handle functions declared as stdarg: by
+   passing an extra pointer to the unnamed arguments.  However, the
+   Fortran frontend can produce a different situation, where a
+   function pointer is declared with no arguments, but the actual
+   function and calls to it take more arguments.  In that case, we
+   want to ensure the call matches the definition of the function.
+
+   So the answer is true when no function type is recorded in CUM_V or
+   when that type is a stdarg function, and false otherwise.  */
+
+static bool
+gcn_strict_argument_naming (cumulative_args_t cum_v)
+{
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+
+  if (cum->fntype == NULL_TREE)
+    return true;
+
+  return stdarg_p (cum->fntype);
+}
+
+/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
+
+   This is the exact opposite of gcn_strict_argument_naming; see the
+   comment on that function.  */
+
+static bool
+gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
+{
+  const bool strict_p = gcn_strict_argument_naming (cum_v);
+
+  return !strict_p;
+}
+
+/* Implement TARGET_FUNCTION_ARG.
+
+ Return an RTX indicating whether a function argument is passed in a register
+ and if so, which register.
+
+ CUM_V summarizes the arguments seen so far, MODE and TYPE describe this
+ argument, and NAMED is false for arguments without a name.
+
+ Normal functions: 0 is returned for unnamed arguments, VOIDmode, and
+ arguments that must be passed on the stack. Otherwise the candidate
+ register is FIRST_PARM_REG + CUM->num, advanced until the hard register
+ number is a multiple of the number of registers the argument needs (the
+ same rounding gcn_function_arg_advance applies). The register is used
+ only if the whole argument fits below FIRST_PARM_REG + NUM_PARM_REGS;
+ otherwise 0 is returned.
+
+ Kernels: the first CUM->args.nargs arguments are looked up through
+ CUM->args.order and returned in the register recorded in CUM->args.reg;
+ a mode that does not match gcn_kernel_arg_types is diagnosed and 0 is
+ returned. Every later argument is returned as a read-only MEM in
+ ADDR_SPACE_SCALAR_FLAT, addressed from the KERNARG_SEGMENT_PTR_ARG
+ register plus CUM->offset. That path has side effects: CUM->offset is
+ rounded up to the alignment of TYPE and
+ cfun->machine->kernarg_segment_alignment is raised to at least that
+ alignment. TYPE is dereferenced unconditionally there, so it is
+ presumably never NULL for kernel arguments -- TODO confirm. */
+
+static rtx
+gcn_function_arg (cumulative_args_t cum_v, machine_mode mode, const_tree type,
+ bool named)
+{
+ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+ if (cum->normal_function)
+ {
+ if (!named || mode == VOIDmode)
+ return 0;
+
+ if (targetm.calls.must_pass_in_stack (mode, type))
+ return 0;
+
+ int reg_num = FIRST_PARM_REG + cum->num;
+ int num_regs = num_arg_regs (mode, type);
+ if (num_regs > 0)
+ while (reg_num % num_regs != 0)
+ reg_num++;
+ if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
+ return gen_rtx_REG (mode, reg_num);
+ }
+ else
+ {
+ if (cum->num >= cum->args.nargs)
+ {
+ cum->offset = (cum->offset + TYPE_ALIGN (type) / 8 - 1)
+ & -(TYPE_ALIGN (type) / 8);
+ cfun->machine->kernarg_segment_alignment
+ = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
+ TYPE_ALIGN (type) / 8);
+ rtx addr = gen_rtx_REG (DImode,
+ cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
+ if (cum->offset)
+ addr = gen_rtx_PLUS (DImode, addr,
+ gen_int_mode (cum->offset, DImode));
+ rtx mem = gen_rtx_MEM (mode, addr);
+ set_mem_attributes (mem, const_cast<tree>(type), 1);
+ set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
+ MEM_READONLY_P (mem) = 1;
+ return mem;
+ }
+
+ int a = cum->args.order[cum->num];
+ if (mode != gcn_kernel_arg_types[a].mode)
+ {
+ error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
+ return 0;
+ }
+ return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
+ cum->args.reg[a]);
+ }
+ return 0;
+}
+
+/* Implement TARGET_FUNCTION_ARG_ADVANCE.
+
+   Updates the summarizer variable pointed to by CUM_V to advance past an
+   argument of MODE and TYPE in the argument list.
+
+   For kernels, the register-argument counter is bumped while runtime
+   arguments remain; after that the kernarg offset grows by the size of
+   TYPE and is recorded as the kernarg segment size.  For normal
+   functions, unnamed arguments are skipped; named ones first round the
+   register position up exactly as gcn_function_arg does and then consume
+   the registers the argument needs.  */
+
+static void
+gcn_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
+                          const_tree type, bool named)
+{
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+
+  if (!cum->normal_function)
+    {
+      if (cum->num < cum->args.nargs)
+        {
+          cum->num++;
+          return;
+        }
+
+      cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (type));
+      cfun->machine->kernarg_segment_byte_size = cum->offset;
+      return;
+    }
+
+  if (!named)
+    return;
+
+  const int nregs = num_arg_regs (mode, type);
+
+  if (nregs > 0)
+    while ((FIRST_PARM_REG + cum->num) % nregs != 0)
+      cum->num++;
+  cum->num += nregs;
+}
+
+/* Implement TARGET_ARG_PARTIAL_BYTES.
+
+   Returns the number of bytes at the beginning of an argument that must be
+   put in registers.  The value must be zero for arguments that are passed
+   entirely in registers or that are entirely pushed on the stack.
+
+   Unnamed arguments, arguments that must go on the stack, arguments that
+   arrive once all parameter registers are used, and arguments that fit
+   completely in the remaining registers all yield zero.  Anything else
+   yields the size of the remaining parameter registers.  */
+
+static int
+gcn_arg_partial_bytes (cumulative_args_t cum_v, machine_mode mode, tree type,
+                       bool named)
+{
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+
+  if (!named || targetm.calls.must_pass_in_stack (mode, type))
+    return 0;
+
+  /* Entirely on the stack, or entirely in registers.  */
+  if (cum->num >= NUM_PARM_REGS
+      || cum->num + num_arg_regs (mode, type) <= NUM_PARM_REGS)
+    return 0;
+
+  return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
+}
+
+/* A normal function which takes a pointer argument (to a scalar) may be
+   passed a pointer to LDS space (via a high-bits-set aperture), and that
+   only works with FLAT addressing, not GLOBAL.  Force FLAT addressing if
+   the function has an incoming pointer-to-scalar parameter.
+
+   FNDECL's argument type list is scanned, and
+   cfun->machine->use_flat_addressing is set if any entry is a pointer to
+   a non-aggregate type.  The flag is never cleared here.  */
+
+static void
+gcn_detect_incoming_pointer_arg (tree fndecl)
+{
+  gcc_assert (cfun && cfun->machine);
+
+  tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
+
+  while (arg)
+    {
+      tree argtype = TREE_VALUE (arg);
+
+      if (POINTER_TYPE_P (argtype)
+          && !AGGREGATE_TYPE_P (TREE_TYPE (argtype)))
+        cfun->machine->use_flat_addressing = true;
+
+      arg = TREE_CHAIN (arg);
+    }
+}
+
+/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
+
+ Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
+ whose data type is FNTYPE. For a library call, FNTYPE is 0.
+
+ CUM is cleared and FNTYPE recorded in it. CALLER is nonzero when
+ setting up an outgoing call and zero when setting up the incoming
+ arguments of the current function.
+
+ A non-null LIBNAME always means a normal function. When !CALLER the
+ current function is also marked normal and scanned for incoming pointer
+ arguments; nothing else is done.
+
+ Otherwise the "amdgpu_hsa_kernel" attribute is looked up on FNDECL, then
+ on FNDECL's type, then on FNTYPE. Without that attribute, an incoming
+ main () or "omp target entrypoint" function is still set up as a kernel,
+ using the attribute parser's defaults. In every other case the
+ function is normal if it has no attribute or if this is an outgoing
+ call, and the attribute value (if any) is parsed into CUM->args.
+
+ Finally CUM->args is copied to cfun->machine->args, and an incoming
+ normal function is scanned for pointer arguments.
+
+ NOTE(review): the copy to cfun->machine->args also happens when CALLER
+ is nonzero, i.e. for outgoing calls made by the current function --
+ confirm that overwriting the current function's own argument layout
+ there is intended. */
+
+void
+gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
+ tree fntype /* function type, or 0 for a library call */ ,
+ rtx libname /* SYMBOL_REF of library name or 0 */ ,
+ tree fndecl, int caller)
+{
+ memset (cum, 0, sizeof (*cum));
+ cum->fntype = fntype;
+ if (libname)
+ {
+ gcc_assert (cfun && cfun->machine);
+ cum->normal_function = true;
+ if (!caller)
+ {
+ cfun->machine->normal_function = true;
+ gcn_detect_incoming_pointer_arg (fndecl);
+ }
+ return;
+ }
+ tree attr = NULL;
+ if (fndecl)
+ attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
+ if (fndecl && !attr)
+ attr = lookup_attribute ("amdgpu_hsa_kernel",
+ TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
+ if (!attr && fntype)
+ attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
+ /* Handle main () as kernel, so we can run testsuite.
+ Handle OpenACC kernels similarly to main. */
+ if (!attr && !caller && fndecl
+ && (MAIN_NAME_P (DECL_NAME (fndecl))
+ || lookup_attribute ("omp target entrypoint",
+ DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
+ gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
+ else
+ {
+ if (!attr || caller)
+ {
+ gcc_assert (cfun && cfun->machine);
+ cum->normal_function = true;
+ if (!caller)
+ cfun->machine->normal_function = true;
+ }
+ gcn_parse_amdgpu_hsa_kernel_attribute
+ (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
+ }
+ cfun->machine->args = cum->args;
+ if (!caller && cfun->machine->normal_function)
+ gcn_detect_incoming_pointer_arg (fndecl);
+}
+
+/* Return true if a value of TYPE must be returned in memory rather than
+   in registers: that is, if TYPE is an aggregate, has BLKmode, or is
+   larger than two words.  Presumably this implements
+   TARGET_RETURN_IN_MEMORY; the hook registration is outside this part of
+   the file.  */
+
+static bool
+gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
+{
+  HOST_WIDE_INT size = int_size_in_bytes (type);
+
+  return (AGGREGATE_TYPE_P (type)
+          || TYPE_MODE (type) == BLKmode
+          || size > 2 * UNITS_PER_WORD);
+}
+
+/* Implement TARGET_PROMOTE_FUNCTION_MODE.
+
+   Return the mode to use for outgoing function arguments.  Integer modes
+   narrower than 4 bytes are promoted to SImode; every other mode is
+   returned unchanged.  The remaining arguments are unused, so the
+   signedness pointed to by PUNSIGNEDP is left alone.  */
+
+machine_mode
+gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
+                           int *ARG_UNUSED (punsignedp),
+                           const_tree ARG_UNUSED (funtype),
+                           int ARG_UNUSED (for_return))
+{
+  if (GET_MODE_CLASS (mode) != MODE_INT || GET_MODE_SIZE (mode) >= 4)
+    return mode;
+
+  return SImode;
+}
+
+/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
+
+ Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
+ ARGS_GROW_DOWNWARDS.
+
+ VALIST is the va_list expression and TYPE the type of the argument being
+ fetched. PRE_P and POST_P are unused. The tree returned does the
+ following when evaluated: subtract the size of the argument from
+ VALIST, round the result down to a multiple of 8, store it back into
+ VALIST, and dereference it as a pointer to TYPE.
+
+ If pass_by_reference says TYPE is passed by reference, the slot holds a
+ pointer instead: the size used is that of a pointer to TYPE and the
+ result is dereferenced a second time. */
+
+static tree
+gcn_gimplify_va_arg_expr (tree valist, tree type,
+ gimple_seq *ARG_UNUSED (pre_p),
+ gimple_seq *ARG_UNUSED (post_p))
+{
+ tree ptr = build_pointer_type (type);
+ tree valist_type;
+ tree t, u;
+ bool indirect;
+
+ indirect = pass_by_reference (NULL, TYPE_MODE (type), type, 0);
+ if (indirect)
+ {
+ type = ptr;
+ ptr = build_pointer_type (type);
+ }
+ valist_type = TREE_TYPE (valist);
+
+ /* Args grow down. Not handled by generic routines. */
+
+ u = fold_convert (sizetype, size_in_bytes (type));
+ u = fold_build1 (NEGATE_EXPR, sizetype, u);
+ t = fold_build_pointer_plus (valist, u);
+
+ /* Align to 8 byte boundary. */
+
+ u = build_int_cst (TREE_TYPE (t), -8);
+ t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
+ t = fold_convert (valist_type, t);
+
+ t = build2 (MODIFY_EXPR, valist_type, valist, t);
+
+ t = fold_convert (ptr, t);
+ t = build_va_arg_indirect_ref (t);
+
+ if (indirect)
+ t = build_va_arg_indirect_ref (t);
+
+ return t;
+}
+
+/* Work out the frame-layout figures used by the prologue and epilogue
+   expanders and by gcn_initial_elimination_offset.
+
+   The figures live in cfun->machine, which is also the return value.  Once
+   reload has completed they are returned as they stand, without being
+   recomputed.  */
+
+static struct machine_function *
+gcn_compute_frame_offsets (void)
+{
+  machine_function *frame = cfun->machine;
+
+  if (!reload_completed)
+    {
+      frame->need_frame_pointer = frame_pointer_needed;
+      frame->outgoing_args_size = crtl->outgoing_args_size;
+      frame->pretend_size = crtl->args.pretend_args_size;
+      frame->local_vars = get_frame_size ();
+
+      frame->lr_needs_saving = (!leaf_function_p ()
+                                || df_regs_ever_live_p (LR_REGNUM)
+                                || df_regs_ever_live_p (LR_REGNUM + 1));
+
+      /* Eight bytes are set aside when the link register must be saved.  */
+      frame->callee_saves = frame->lr_needs_saving ? 8 : 0;
+
+      /* Each register to be preserved adds 256 bytes if it is a VGPR and
+         4 bytes otherwise.  */
+      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+        {
+          const bool live_callee_saved
+            = df_regs_ever_live_p (regno) && !call_used_regs[regno];
+          const bool wanted
+            = (live_callee_saved
+               || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
+                   && frame_pointer_needed));
+
+          if (wanted)
+            frame->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
+        }
+
+      /* Round up to 64-bit boundary to maintain stack alignment.  */
+      frame->callee_saves = (frame->callee_saves + 7) & ~7;
+    }
+
+  return frame;
+}
+
+/* Insert code into the prologue or epilogue to store or load any
+ callee-save register to/from the stack.
+
+ Helper function for gcn_expand_prologue and gcn_expand_epilogue.
+
+ SP is the rtx used as the base address of the save area. OFFSETS supplies
+ the lr_needs_saving, need_frame_pointer and pretend_size fields read
+ below. PROLOGUE selects saving (true) or restoring (false).
+
+ Scalar registers are not written to memory one by one: each is copied
+ into a lane of VGPR 6 (VGPR 7 once 64 lanes are in use), and those
+ vectors are then stored or loaded along with the callee-saved vector
+ registers. Two insn sequences are built and emitted in opposite orders
+ for prologue and epilogue (see the end of the function).
+
+ The generated code writes hard registers directly: EXEC, VCC, SGPR 22,
+ and VGPRs 3 and 4, as well as VGPRs 6 and 7 when scalars are saved. */
+
+static void
+move_callee_saved_registers (rtx sp, machine_function *offsets,
+ bool prologue)
+{
+ int regno, offset, saved_scalars;
+ rtx exec = gen_rtx_REG (DImode, EXEC_REG);
+ rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
+ rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
+ rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
+ HOST_WIDE_INT exec_set = 0;
+ int offreg_set = 0;
+
+ start_sequence ();
+
+ /* Move scalars into two vector registers.
+ Every register below FIRST_VGPR_REG that needs preserving (live and not
+ call-used, or part of the link-register / frame-pointer pair when those
+ are saved) takes lane saved_scalars % 64 of VGPR 6 + saved_scalars / 64:
+ vec_set when saving, vec_extract when restoring. */
+ for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
+ if ((df_regs_ever_live_p (regno) && !call_used_regs[regno])
+ || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
+ || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
+ && offsets->need_frame_pointer))
+ {
+ rtx reg = gen_rtx_REG (SImode, regno);
+ rtx vreg = gen_rtx_REG (V64SImode,
+ VGPR_REGNO (6 + (saved_scalars / 64)));
+ int lane = saved_scalars % 64;
+
+ if (prologue)
+ emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
+ else
+ emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
+
+ saved_scalars++;
+ }
+
+ rtx move_scalars = get_insns ();
+ end_sequence ();
+ start_sequence ();
+
+ /* Ensure that all vector lanes are moved. EXEC_SET mirrors the value
+ most recently written to EXEC so redundant writes can be skipped. */
+ exec_set = -1;
+ emit_move_insn (exec, GEN_INT (exec_set));
+
+ /* Set up a vector stack pointer.
+ VSP (V64DImode, starting at VGPR 4) is SP duplicated into every lane
+ plus VGPR 1 shifted left by two (kept in VGPR 3); the carry out of the
+ low-word addition is passed to the high word through VCC.
+ NOTE(review): going by the variable name, VGPR 1 presumably holds the
+ lane numbers 0, 1, 2, 3, ... on entry -- confirm against the ABI. */
+ rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+ rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
+ emit_insn (gen_ashlv64si3_vector (_0_4_8_12, _0_1_2_3, GEN_INT (2), exec,
+ gcn_gen_undef (V64SImode)));
+ rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
+ emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, exec,
+ gcn_gen_undef (V64DImode)));
+ emit_insn (gen_addv64si3_vector_vcc (gcn_operand_part (V64SImode, vsp, 0),
+ gcn_operand_part (V64SImode, vsp, 0),
+ _0_4_8_12, exec,
+ gcn_gen_undef (V64SImode), vcc,
+ gcn_gen_undef (DImode)));
+ emit_insn (gen_addcv64si3_vec (gcn_operand_part (V64SImode, vsp, 1),
+ gcn_operand_part (V64SImode, vsp, 1),
+ const0_rtx, exec, gcn_gen_undef (V64SImode),
+ vcc, vcc, gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0),
+ gcn_gen_undef (DImode)));
+
+ /* Move vectors.
+ Handles every live, non-call-used register from FIRST_VGPR_REG upwards,
+ plus VGPR 6 / VGPR 7 when they carry packed scalars. A full vector is
+ 256 bytes; the scalar carriers only transfer the lanes actually used,
+ which is arranged by narrowing EXEC to the low SIZE / 4 bits. */
+ for (regno = FIRST_VGPR_REG, offset = offsets->pretend_size;
+ regno < FIRST_PSEUDO_REGISTER; regno++)
+ if ((df_regs_ever_live_p (regno) && !call_used_regs[regno])
+ || (regno == VGPR_REGNO (6) && saved_scalars > 0)
+ || (regno == VGPR_REGNO (7) && saved_scalars > 63))
+ {
+ rtx reg = gen_rtx_REG (V64SImode, regno);
+ int size = 256;
+
+ if (regno == VGPR_REGNO (6) && saved_scalars < 64)
+ size = saved_scalars * 4;
+ else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
+ size = (saved_scalars - 64) * 4;
+
+ if (size != 256 || exec_set != -1)
+ {
+ exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
+ emit_move_insn (exec, gen_int_mode (exec_set, DImode));
+ }
+
+ if (prologue)
+ emit_insn (gen_scatterv64si_insn_1offset (vsp, const0_rtx, reg, as,
+ const0_rtx, exec));
+ else
+ emit_insn (gen_gatherv64si_insn_1offset (reg, vsp, const0_rtx, as,
+ const0_rtx,
+ gcn_gen_undef (V64SImode),
+ exec));
+
+ /* Move our VSP to the next stack entry: add SIZE (held in OFFREG,
+ reloaded only when it changes) to every lane of VSP with all lanes
+ enabled, again carrying into the high word through VCC. */
+ if (offreg_set != size)
+ {
+ offreg_set = size;
+ emit_move_insn (offreg, GEN_INT (size));
+ }
+ if (exec_set != -1)
+ {
+ exec_set = -1;
+ emit_move_insn (exec, GEN_INT (exec_set));
+ }
+ emit_insn (gen_addv64si3_vector_vcc_dup
+ (gcn_operand_part (V64SImode, vsp, 0),
+ gcn_operand_part (V64SImode, vsp, 0),
+ offreg, exec, gcn_gen_undef (V64SImode),
+ vcc, gcn_gen_undef (DImode)));
+ emit_insn (gen_addcv64si3_vec
+ (gcn_operand_part (V64SImode, vsp, 1),
+ gcn_operand_part (V64SImode, vsp, 1),
+ const0_rtx, exec, gcn_gen_undef (V64SImode),
+ vcc, vcc, gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0), gcn_gen_undef (DImode)));
+
+ offset += size; /* NOTE(review): OFFSET is only initialised and
+ incremented; nothing in this function reads it. */
+ }
+
+ rtx move_vectors = get_insns ();
+ end_sequence (); /* When saving, the scalars must be in VGPRs 6/7 before
+ those vectors are stored; when restoring, the vectors are loaded first
+ and the scalars are then extracted from them. */
+
+ if (prologue)
+ {
+ emit_insn (move_scalars);
+ emit_insn (move_vectors);
+ }
+ else
+ {
+ emit_insn (move_vectors);
+ emit_insn (move_scalars);
+ }
+}
+
+/* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
+
+ For a non-kernel function, the stack layout looks like this (interim),
+ growing *upwards*:
+
+ hi | + ...
+    |__________________| <-- current SP
+    | outgoing args    |
+    |__________________|
+    | (alloca space)   |
+    |__________________|
+    | local vars       |
+    |__________________| <-- FP/hard FP
+    | callee-save regs |
+    |__________________| <-- soft arg pointer
+    | pretend args     |
+    |__________________| <-- incoming SP
+    | incoming args    |
+ lo |..................|
+
+ This implies arguments (beyond the first N in registers) must grow
+ downwards (as, apparently, PA has them do).
+
+ For a kernel function we have the simpler:
+
+ hi | + ...
+    |__________________| <-- current SP
+    | outgoing args    |
+    |__________________|
+    | (alloca space)   |
+    |__________________|
+    | local vars       |
+ lo |__________________| <-- FP/hard FP
+
+ A function is treated as a kernel when cfun->machine->normal_function is
+ clear. In both cases M0 is loaded with LDS_SIZE afterwards, and kernels
+ compiled with flag_openmp additionally call gomp_gcn_enter_kernel.
+
+*/
+
+void
+gcn_expand_prologue ()
+{
+ machine_function *offsets = gcn_compute_frame_offsets ();
+
+ if (!cfun || !cfun->machine || cfun->machine->normal_function)
+ {
+ rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
+ rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
+
+ start_sequence ();
+
+ if (offsets->pretend_size > 0)
+ {
+ /* FIXME: Do the actual saving of register pretend args to the stack.
+ Register order needs consideration. */
+ }
+
+ /* Save callee-save regs. This happens before SP is adjusted, so the
+ save area is addressed from the incoming SP. */
+ move_callee_saved_registers (sp, offsets, true);
+
+ HOST_WIDE_INT sp_adjust = offsets->pretend_size
+ + offsets->callee_saves
+ + offsets->local_vars + offsets->outgoing_args_size;
+ if (sp_adjust > 0)
+ emit_insn (gen_adddi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
+
+ if (offsets->need_frame_pointer)
+ emit_insn (gen_adddi3 (fp, sp,
+ gen_int_mode (-(offsets->local_vars +
+ offsets->outgoing_args_size),
+ DImode))); /* FP = new SP - (local vars + outgoing args). */
+
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+
+ /* FIXME: Prologue insns should have this flag set for debug output, etc.
+ but it causes issues for now.
+ for (insn = seq; insn; insn = NEXT_INSN (insn))
+ if (INSN_P (insn))
+ RTX_FRAME_RELATED_P (insn) = 1;*/
+
+ emit_insn (seq);
+ }
+ else
+ {
+ rtx wave_offset = gen_rtx_REG (SImode,
+ cfun->machine->args.
+ reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
+
+ if (TARGET_GCN5_PLUS)
+ {
+ /* v0 is reserved for constant zero so that "global"
+ memory instructions can have a nul-offset without
+ causing reloads. All lanes are enabled while it is cleared. */
+ rtx exec = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (exec, GEN_INT (-1));
+ emit_insn (gen_vec_duplicatev64si_exec
+ (gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
+ const0_rtx, exec, gcn_gen_undef (V64SImode)));
+ }
+
+ if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
+ {
+ rtx fs_init_lo =
+ gen_rtx_REG (SImode,
+ cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
+ rtx fs_init_hi =
+ gen_rtx_REG (SImode,
+ cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
+ rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
+ rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
+
+ /*rtx queue = gen_rtx_REG(DImode,
+ cfun->machine->args.reg[QUEUE_PTR_ARG]);
+ rtx aperture = gen_rtx_MEM (SImode,
+ gen_rtx_PLUS (DImode, queue,
+ gen_int_mode (68, SImode)));
+ set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
+
+ /* Set up flat_scratch: the high register receives
+ (fs_init_lo + wave_offset) shifted right by 8, and the low register
+ receives fs_init_hi. */
+ emit_insn (gen_addsi3 (fs_reg_hi, fs_init_lo, wave_offset));
+ emit_insn (gen_lshrsi3_scalar (fs_reg_hi, fs_reg_hi,
+ gen_int_mode (8, SImode)));
+ emit_move_insn (fs_reg_lo, fs_init_hi);
+ }
+
+ /* Set up frame pointer and stack pointer. */
+ rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
+ rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
+ rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
+ rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
+
+ HOST_WIDE_INT sp_adjust = (offsets->local_vars
+ + offsets->outgoing_args_size);
+
+ /* Initialise FP and SP from the buffer descriptor in s[0:3].
+ FP low word = s0 + wave_offset; FP high word = s1 & 0xffff, to which
+ the SCC result of that addition is then applied (presumably an
+ add-with-carry, going by the pattern name). */
+ emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
+ emit_insn (gen_andsi3 (fp_hi, gen_rtx_REG (SImode, 1),
+ gen_int_mode (0xffff, SImode)));
+ emit_insn (gen_addsi3 (fp_lo, fp_lo, wave_offset));
+ emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi,
+ gen_rtx_REG (BImode, SCC_REG)));
+
+ if (sp_adjust > 0)
+ emit_insn (gen_adddi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
+ else
+ emit_move_insn (sp, fp);
+
+ /* Make sure the flat scratch reg doesn't get optimised away. */
+ emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
+ }
+
+ emit_move_insn (gen_rtx_REG (SImode, M0_REG),
+ gen_int_mode (LDS_SIZE, SImode)); /* Done for kernels and normal
+ functions alike; the prologue_use insns below mark M0 (and v0 on
+ GCN5+) as used. */
+
+ emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
+ if (TARGET_GCN5_PLUS)
+ emit_insn (gen_prologue_use (gen_rtx_REG (SImode, VGPR_REGNO (0))));
+
+ if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
+ {
+ /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. The
+ symbol address is loaded into FIRST_PARM_REG and called indirectly. */
+ rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
+ emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
+ "gomp_gcn_enter_kernel"));
+ emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
+ }
+}
+
+/* Generate epilogue.  Called from gen_epilogue during pro_and_epilogue pass.
+
+   See gcn_expand_prologue for stack details.
+
+   Normal functions unwind SP, reload the callee-saved registers and mark
+   the link register and frame pointer as used where they were saved.
+   Kernels built with flag_openmp call gomp_gcn_exit_kernel instead; other
+   kernels with a non-void result store that result through a pointer
+   loaded from the kernel arguments.  Every path ends in a gcn_return.  */
+
+void
+gcn_expand_epilogue (void)
+{
+  const bool kernel_p
+    = cfun && cfun->machine && !cfun->machine->normal_function;
+
+  if (!kernel_p)
+    {
+      machine_function *frame = gcn_compute_frame_offsets ();
+      rtx stack_ptr = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
+      rtx frame_ptr = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
+
+      /* Distance from the register SP is recovered from (FP when there is
+         one, the current SP otherwise) back to the old SP.  */
+      HOST_WIDE_INT unwind = frame->callee_saves + frame->pretend_size;
+      rtx base = frame_ptr;
+
+      if (!frame->need_frame_pointer)
+        {
+          unwind += frame->outgoing_args_size + frame->local_vars;
+          base = stack_ptr;
+        }
+
+      if (unwind > 0)
+        emit_insn (gen_subdi3 (stack_ptr, base,
+                               gen_int_mode (unwind, DImode)));
+      else if (frame->need_frame_pointer)
+        emit_move_insn (stack_ptr, frame_ptr);
+
+      move_callee_saved_registers (stack_ptr, frame, false);
+
+      /* The return insn carries no explicit use of the link register, so
+         emit one here instead.  */
+      if (frame->lr_needs_saving)
+        emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
+
+      /* Likewise for the frame pointer.  */
+      if (frame->need_frame_pointer)
+        emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
+    }
+  else if (flag_openmp)
+    {
+      /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel.  */
+      rtx callee = gen_rtx_REG (Pmode, FIRST_PARM_REG);
+
+      emit_move_insn (callee,
+                      gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
+      emit_call_insn (gen_gcn_indirect_call (callee, const0_rtx));
+    }
+  else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
+    {
+      /* Assume that an exit value compatible with gcn-run is expected.
+         That is, the third input parameter is an int*.
+
+         No new registers can be allocated here, but the kernarg register
+         is dead after this point, so it doubles as the scratch pointer.  */
+      rtx scratch = gen_rtx_REG (DImode, cfun->machine->args.reg
+                                 [KERNARG_SEGMENT_PTR_ARG]);
+
+      /* Fetch the result pointer from byte offset 16 of the kernargs.  */
+      rtx ptr_slot = gen_rtx_MEM (DImode,
+                                  gen_rtx_PLUS (DImode, scratch,
+                                                GEN_INT (16)));
+      set_mem_addr_space (ptr_slot, ADDR_SPACE_SCALAR_FLAT);
+      emit_move_insn (scratch, ptr_slot);
+
+      /* Store the return-value register through it.  */
+      rtx result_slot = gen_rtx_MEM (SImode, scratch);
+      set_mem_addr_space (result_slot, ADDR_SPACE_SCALAR_FLAT);
+      emit_move_insn (result_slot,
+                      gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
+    }
+
+  emit_jump_insn (gen_gcn_return ());
+}
+
+/* Implement TARGET_CAN_ELIMINATE.
+
+   Say whether the compiler may try to replace register FROM_REG with
+   register TO_REG.  Only the destination matters here: elimination is
+   allowed into the stack pointer or the hard frame pointer.
+
+   FIXME: is the default "true" not enough?  Should this be a negative set?  */
+
+bool
+gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
+{
+  if (to_reg == STACK_POINTER_REGNUM)
+    return true;
+
+  return to_reg == HARD_FRAME_POINTER_REGNUM;
+}
+
+/* Implement INITIAL_ELIMINATION_OFFSET.
+
+   Return the initial difference, in terms of stack position, between
+   registers FROM and TO.  FROM must be the arg pointer or the (soft) frame
+   pointer; any other FROM/TO combination is treated as unreachable.  */
+
+HOST_WIDE_INT
+gcn_initial_elimination_offset (int from, int to)
+{
+  machine_function *frame = gcn_compute_frame_offsets ();
+
+  if (from == ARG_POINTER_REGNUM)
+    {
+      if (to == STACK_POINTER_REGNUM)
+        return -(frame->callee_saves + frame->local_vars
+                 + frame->outgoing_args_size);
+
+      if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
+        return -frame->callee_saves;
+    }
+  else if (from == FRAME_POINTER_REGNUM)
+    {
+      if (to == STACK_POINTER_REGNUM)
+        return -(frame->local_vars + frame->outgoing_args_size);
+
+      if (to == HARD_FRAME_POINTER_REGNUM)
+        return 0;
+    }
+
+  /* No other elimination is expected.  */
+  gcc_unreachable ();
+}
+
+/* Implement HARD_REGNO_RENAME_OK.
+
+   Say whether hard register FROM_REG may be renamed to TO_REG.  Special
+   registers never take part.  The link-register pair is available only if
+   it is being saved, and the static-chain pair only while the chain is not
+   in active use.  Both restrictions are lifted when there is no current
+   function.  */
+
+bool
+gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
+{
+  if (SPECIAL_REGNO_P (from_reg) || SPECIAL_REGNO_P (to_reg))
+    return false;
+
+  /* The registers below are handled as even/odd pairs.  */
+  const unsigned int to_pair = to_reg & ~1;
+
+  if (to_pair == LINK_REGNUM)
+    {
+      if (!cfun)
+        return true;
+      return cfun->machine->lr_needs_saving;
+    }
+
+  if (to_pair == STATIC_CHAIN_REGNUM)
+    {
+      if (!cfun || !cfun->static_chain_decl)
+        return true;
+      return !(df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
+               && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
+    }
+
+  return true;
+}
+
+/* Implement HARD_REGNO_CALLER_SAVE_MODE.
+
+   Pick the mode used to save NREGS registers of a pseudo held in
+   call-clobbered hard register REGNO.  The choice of choose_hard_reg_mode
+   is kept unless it is a vector mode while REGMODE is not; then SImode is
+   used for a single register and DImode otherwise.  */
+
+machine_mode
+gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
+                                 machine_mode regmode)
+{
+  machine_mode picked = choose_hard_reg_mode (regno, nregs, false);
+
+  if (!VECTOR_MODE_P (picked) || VECTOR_MODE_P (regmode))
+    return picked;
+
+  /* A scalar value must not be saved in a vector mode.  */
+  if (nregs == 1)
+    return SImode;
+
+  return DImode;
+}
+
+/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
+
+   Write to F the constant part of a trampoline: four s_mov_b32
+   instructions with a placeholder literal, loading the static-chain pair
+   and the CC_SAVE_REG pair, followed by an s_setpc_b64 through the
+   CC_SAVE_REG pair.  The variable parts are left for
+   gcn_trampoline_init to fill in.  */
+
+static void
+gcn_asm_trampoline_template (FILE *f)
+{
+  /* Destination registers of the four moves, in emission order.  The
+     source operand of each move must be a 32-bit constant following the
+     opcode.  */
+  const int mov_dest[] = {
+    STATIC_CHAIN_REGNUM,
+    STATIC_CHAIN_REGNUM + 1,
+    CC_SAVE_REG,
+    CC_SAVE_REG + 1
+  };
+  const unsigned int n_movs = sizeof mov_dest / sizeof mov_dest[0];
+
+  for (unsigned int i = 0; i < n_movs; i++)
+    asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", mov_dest[i]);
+
+  asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
+}
+
+/* Implement TARGET_TRAMPOLINE_INIT.
+
+   Emit RTL insns that fill in the variable parts of a trampoline.
+   M_TRAMP is a MEM for the trampoline, FNDECL is the decl of the target
+   function and CHAIN_VALUE is an RTX for the static chain to pass to it.
+
+   The template is copied first.  Then the SImode word at byte offset 4 of
+   each of the first four 8-byte slots is overwritten with, in order, the
+   low and high halves of the static chain and the low and high halves of
+   the function address.  Finally the instruction cache is cleared over
+   the trampoline.  */
+
+static void
+gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
+{
+  emit_block_move (m_tramp, assemble_trampoline_template (),
+                   GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
+
+  rtx target_addr = XEXP (DECL_RTL (fndecl), 0);
+  rtx chain_reg = copy_to_reg (chain_value);
+  rtx target_reg = copy_to_reg (target_addr);
+
+  rtx sources[2] = { chain_reg, target_reg };
+  int slot = 0;
+
+  for (int s = 0; s < 2; s++)
+    for (int byte = 0; byte < 8; byte += 4, slot++)
+      {
+        rtx patch = adjust_address (m_tramp, SImode, slot * 8 + 4);
+        emit_move_insn (patch, gen_rtx_SUBREG (SImode, sources[s], byte));
+      }
+
+  rtx start = XEXP (m_tramp, 0);
+  rtx end = plus_constant (ptr_mode, start, TRAMPOLINE_SIZE);
+  emit_insn (gen_clear_icache (start, end));
+}
+
+/* }}} */
+/* {{{ Miscellaneous. */
+
+/* Implement TARGET_CANNOT_COPY_INSN_P.
+
+   Return true if INSN must not be duplicated.  That is the case only for
+   insns recognised as gcn_wavefront_barrier.  */
+
+static bool
+gcn_cannot_copy_insn_p (rtx_insn *insn)
+{
+  return recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier;
+}
+
+/* Implement TARGET_DEBUG_UNWIND_INFO.
+
+   Report which mechanism describes frame unwind information to the
+   debugger.  The answer is always UI_NONE.  */
+
+static enum unwind_info_type
+gcn_debug_unwind_info (void)
+{
+  /* Debug info is not supported yet.  */
+  const enum unwind_info_type mechanism = UI_NONE;
+
+  return mechanism;
+}
+
+/* Say whether a hardware conversion instruction exists for converting
+   mode FROM to mode TO with operation OP.  Used primarily by the machine
+   description.
+
+   FROM and TO must both be vectors or both be scalars; vectors are judged
+   by their element modes.  OP constrains the mode classes (and, for
+   extend/truncate, the relative sizes); the mode pair itself must then be
+   one of the combinations listed at the end.  */
+
+bool
+gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
+{
+  const bool vector_p = VECTOR_MODE_P (from);
+
+  if (vector_p != VECTOR_MODE_P (to))
+    return false;
+
+  if (vector_p)
+    {
+      from = GET_MODE_INNER (from);
+      to = GET_MODE_INNER (to);
+    }
+
+  const bool from_fp = GET_MODE_CLASS (from) == MODE_FLOAT;
+  const bool from_int = GET_MODE_CLASS (from) == MODE_INT;
+  const bool to_fp = GET_MODE_CLASS (to) == MODE_FLOAT;
+  const bool to_int = GET_MODE_CLASS (to) == MODE_INT;
+
+  switch (op)
+    {
+    case fix_trunc_cvt:
+    case fixuns_trunc_cvt:
+      /* Float to integer.  */
+      if (!from_fp || !to_int)
+        return false;
+      break;
+    case float_cvt:
+    case floatuns_cvt:
+      /* Integer to float.  */
+      if (!from_int || !to_fp)
+        return false;
+      break;
+    case extend_cvt:
+      /* Float to strictly wider float.  */
+      if (!from_fp || !to_fp
+          || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
+        return false;
+      break;
+    case trunc_cvt:
+      /* Float to strictly narrower float.  */
+      if (!from_fp || !to_fp
+          || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
+        return false;
+      break;
+    }
+
+  /* The supported source modes for each destination mode.  */
+  if (to == HImode)
+    return from == HFmode;
+  if (to == SImode)
+    return from == SFmode || from == DFmode;
+  if (to == HFmode)
+    return from == HImode || from == SFmode;
+  if (to == SFmode)
+    return from == SImode || from == HFmode || from == DFmode;
+  if (to == DFmode)
+    return from == SImode || from == SFmode;
+
+  return false;
+}
+
+/* Implement TARGET_LEGITIMATE_COMBINED_INSN.
+
+   Return false if the instruction is not appropriate as a combination of two
+   or more instructions.
+
+   A lone SET is accepted when its destination has a vector mode or when its
+   source is a plain REG, SUBREG or MEM.  A PARALLEL is accepted when one of
+   its elements is a USE or a SET with a vector-mode destination.  Patterns
+   of any other code are accepted.  */
+
+bool
+gcn_legitimate_combined_insn (rtx_insn *insn)
+{
+  rtx pat = PATTERN (insn);
+
+  /* The combine pass tends to strip (use (exec)) patterns from insns.  This
+     means it basically switches everything to use the *_scalar form of the
+     instructions, which is not helpful.  So, this function disallows such
+     combinations.  Unfortunately, this also disallows combinations of genuine
+     scalar-only patterns, but those only come from explicit expand code.
+
+     Possible solutions:
+     - Invent TARGET_LEGITIMIZE_COMBINED_INSN.
+     - Remove all (use (EXEC)) and rely on md_reorg with "exec" attribute.
+   */
+
+  switch (GET_CODE (pat))
+    {
+    case SET:
+      /* Vector mode patterns are fine.  */
+      if (VECTOR_MODE_P (GET_MODE (XEXP (pat, 0))))
+        return true;
+      /* Plain moves are fine (fixed up by md_reorg).  */
+      switch (GET_CODE (XEXP (pat, 1)))
+        {
+        case REG:
+        case SUBREG:
+        case MEM:
+          return true;
+        default:
+          /* Any other scalar operation should have been a parallel.  */
+          return false;
+        }
+    case PARALLEL:
+      for (int i = 0; i < XVECLEN (pat, 0); i++)
+        {
+          rtx subpat = XVECEXP (pat, 0, i);
+          switch (GET_CODE (subpat))
+            {
+            case USE:
+              /* FIXME: check it really is EXEC that is used.
+                 Does combine ever generate a pattern with a use?  */
+              return true;
+            case SET:
+              /* Vector mode patterns are fine.  The destination examined
+                 must be that of this element; operand 0 of the PARALLEL
+                 itself is its vector of elements, not an expression.  */
+              if (VECTOR_MODE_P (GET_MODE (XEXP (subpat, 0))))
+                return true;
+              break;
+            default:
+              break;
+            }
+        }
+      /* A suitable pattern was not found.  */
+      return false;
+    default:
+      return true;
+    }
+}
+
+/* Implement both TARGET_ASM_CONSTRUCTOR and TARGET_ASM_DESTRUCTOR.
+
+   The current loader does not support running code outside "main", so a
+   "sorry" diagnostic is issued for SYMBOL, at the location of its decl when
+   it has one.  PRIORITY is ignored.  This hook implementation can be
+   replaced or removed when the loader changes.  */
+
+void
+gcn_disable_constructors (rtx symbol, int priority __attribute__ ((unused)))
+{
+  location_t where = UNKNOWN_LOCATION;
+  tree decl = SYMBOL_REF_DECL (symbol);
+
+  if (decl)
+    where = DECL_SOURCE_LOCATION (decl);
+
+  sorry_at (where, "GCN does not support static constructors or destructors");
+}
+
+/* }}} */
+/* {{{ Costs. */
+
+/* Implement TARGET_RTX_COSTS.
+
+   Store a (partial) cost for rtx X in *TOTAL.  Return true if that is the
+   complete cost and false if subexpressions should still be scanned.
+
+   Constants are costed completely: 0 for inline constants, 1 for a
+   CONST_INT that fits in 16 signed bits, 2 for other values accepted by
+   gcn_constant_p, and otherwise 64 for VGPR vector modes or 4.  DIV costs
+   100 and anything else 3, both with operands still to be scanned.  */
+
+static bool
+gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
+{
+  const enum rtx_code code = GET_CODE (x);
+
+  if (code == DIV)
+    {
+      *total = 100;
+      return false;
+    }
+
+  if (code != CONST && code != CONST_DOUBLE
+      && code != CONST_VECTOR && code != CONST_INT)
+    {
+      *total = 3;
+      return false;
+    }
+
+  int cost;
+
+  if (gcn_inline_constant_p (x))
+    cost = 0;
+  else if (code == CONST_INT
+           && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
+    cost = 1;
+  else if (gcn_constant_p (x))
+    cost = 2;
+  else if (vgpr_vector_mode_p (GET_MODE (x)))
+    cost = 64;
+  else
+    cost = 4;
+
+  *total = cost;
+  return true;
+}
+
+/* Implement TARGET_MEMORY_MOVE_COST.
+
+   Return the cost of moving data of mode MODE between memory and a
+   register of class REGCLASS.  IN is true for a load into the register and
+   false for a store from it.  The cost is relative to those returned by
+   gcn_register_move_cost.
+
+   The cost is charged per 4 bytes of MODE, rounded up.  On top of the base
+   LOAD_COST / STORE_COST, two extra units are charged for stores from the
+   scalar and conditional-register classes, for loads into VGPR_REGS, and
+   for both directions in the mixed classes.  Any class not listed is
+   treated as unreachable.  */
+
+#define LOAD_COST 32
+#define STORE_COST 32
+static int
+gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
+{
+  const int nregs = CEIL (GET_MODE_SIZE (mode), 4);
+  int load_per_reg;
+  int store_per_reg;
+
+  switch (regclass)
+    {
+    case SCC_CONDITIONAL_REG:
+    case VCCZ_CONDITIONAL_REG:
+    case VCC_CONDITIONAL_REG:
+    case EXECZ_CONDITIONAL_REG:
+    case ALL_CONDITIONAL_REGS:
+    case SGPR_REGS:
+    case SGPR_EXEC_REGS:
+    case EXEC_MASK_REG:
+    case SGPR_VOP3A_SRC_REGS:
+    case SGPR_MEM_SRC_REGS:
+    case SGPR_SRC_REGS:
+    case SGPR_DST_REGS:
+    case GENERAL_REGS:
+    case AFP_REGS:
+      load_per_reg = LOAD_COST;
+      store_per_reg = STORE_COST + 2;
+      break;
+
+    case VGPR_REGS:
+      load_per_reg = LOAD_COST + 2;
+      store_per_reg = STORE_COST;
+      break;
+
+    case ALL_REGS:
+    case ALL_GPR_REGS:
+    case SRCDST_REGS:
+      load_per_reg = LOAD_COST + 2;
+      store_per_reg = STORE_COST + 2;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (in)
+    return load_per_reg * nregs;
+
+  return store_per_reg * nregs;
+}
+
+/* Implement TARGET_REGISTER_MOVE_COST.
+
+   Return the cost of moving data from a register in class SRC to one in
+   class DST.  The base value is 2; a move for which exactly one of the two
+   classes sorts below VGPR_REGS costs 4.  */
+
+static int
+gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
+{
+  const bool src_below_vgpr = src < VGPR_REGS;
+  const bool dst_below_vgpr = dst < VGPR_REGS;
+
+  if (src_below_vgpr == dst_below_vgpr)
+    return 2;
+
+  /* Moving from and to vector registers is made more expensive.  While this
+     is fast in hardware (I think), it has the hidden cost of setting up the
+     exec flags.  */
+  return 4;
+}
+
+/* }}} */
+/* {{{ Builtins. */
+
+/* Type codes used by GCN built-in definitions.
+
+ Each code indexes gcn_builtin_types; the note beside an enumerator gives
+ the tree node that gcn_init_builtin_types stores in that slot. */
+
+enum gcn_builtin_type_index
+{
+ GCN_BTI_END_OF_PARAMS, /* Terminates a parameter list; no type node. */
+
+ GCN_BTI_VOID, /* void_type_node */
+ GCN_BTI_BOOL, /* boolean_type_node */
+ GCN_BTI_INT, /* intSI_type_node */
+ GCN_BTI_UINT, /* unsigned variant of intSI_type_node */
+ GCN_BTI_SIZE_T, /* size_type_node */
+ GCN_BTI_LLINT, /* intDI_type_node */
+ GCN_BTI_LLUINT, /* unsigned variant of intDI_type_node */
+ GCN_BTI_EXEC, /* unsigned_intDI_type_node */
+
+ GCN_BTI_SF, /* float32_type_node */
+ GCN_BTI_V64SI, /* vector of 64 intSI_type_node */
+ GCN_BTI_V64SF, /* vector of 64 float_type_node */
+ GCN_BTI_V64PTR, /* vector of 64 unsigned_intDI_type_node */
+ GCN_BTI_SIPTR, /* pointer to ADDR_SPACE_FLAT copy of intSI_type_node */
+ GCN_BTI_SFPTR, /* pointer to ADDR_SPACE_FLAT copy of float_type_node */
+ GCN_BTI_VOIDPTR, /* pointer to ADDR_SPACE_FLAT copy of void_type_node */
+
+ GCN_BTI_LDS_VOIDPTR, /* pointer to ADDR_SPACE_LDS copy of void_type_node */
+
+ GCN_BTI_MAX /* Number of codes; sizes gcn_builtin_types. */
+};
+
+static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX]; /* Type nodes indexed by
+ enum gcn_builtin_type_index; filled in by gcn_init_builtin_types. The
+ macros below name individual slots. */
+
+#define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
+#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
+#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
+#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
+#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
+#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
+#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
+#define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
+#define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
+
+static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
+ struct gcn_builtin_description *);
+static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
+ struct gcn_builtin_description *);
+
+struct gcn_builtin_description;
+typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
+ struct gcn_builtin_description *); /* Signature shared by the expander
+ functions declared above; stored in gcn_builtin_description. */
+
+enum gcn_builtin_type
+{
+ B_UNIMPLEMENTED, /* Sorry out: the builtin is declared but unsupported. */
+ B_INSN, /* Emit a pattern; used by the binop table entries. */
+ B_OVERLOAD /* Placeholder for an overloaded function */
+};
+
+struct gcn_builtin_description
+{
+ int fcode; /* The GCN_BUILTIN_* function code. */
+ int icode; /* Insn code, e.g. a CODE_FOR_* value. */
+ const char *name; /* Name without the "__builtin_gcn_" prefix. */
+ enum gcn_builtin_type type;
+ /* The first element of parm is always the return type. The rest
+ are a zero terminated list of parameters. Elements are
+ enum gcn_builtin_type_index values, so the terminator is
+ GCN_BTI_END_OF_PARAMS. */
+ int parm[6];
+ gcn_builtin_expander expander; /* Function that expands a call. */
+};
+
+/* Read in the GCN builtins from gcn-builtins.def.
+
+ DEF_BUILTIN produces one table entry from its arguments.
+ DEF_BUILTIN_BINOP_INT_FP produces two entries that share one insn code
+ and gcn_expand_builtin_binop: a "_v64int" form taking an EXEC mask and
+ three V64SI values, and a "_v64int_unspec" form taking an EXEC mask and
+ two V64SI values. Both return V64SI. */
+
+extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
+
+struct gcn_builtin_description gcn_builtins[] = {
+#define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
+ {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
+
+#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
+ {GCN_BUILTIN_ ## fcode ## _V64SI, \
+ CODE_FOR_ ## ic ##v64si3_vector, name "_v64int", B_INSN, \
+ {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
+ GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \
+ {GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \
+ CODE_FOR_ ## ic ##v64si3_vector, name "_v64int_unspec", B_INSN, \
+ {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
+ GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
+
+#include "gcn-builtins.def"
+#undef DEF_BUILTIN_BINOP_INT_FP
+#undef DEF_BUILTIN
+};
+
+static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
+
+/* Implement TARGET_BUILTIN_DECL.
+
+   Return the decl recorded for GCN builtin CODE, or error_mark_node when
+   CODE is not below GCN_BUILTIN_MAX.  INITIALIZE_P is ignored.  */
+
+tree
+gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
+{
+  const bool known = code < GCN_BUILTIN_MAX;
+
+  if (known)
+    return gcn_builtin_decls[code];
+
+  return error_mark_node;
+}
+
+/* Helper function for gcn_init_builtins.
+
+   Fill in gcn_builtin_types: the scalar types first, then the 64-element
+   vector types, and finally the pointer types, each of which points to a
+   distinct copy of its base type placed in an explicit address space.  */
+
+static void
+gcn_init_builtin_types (void)
+{
+  /* Scalars.  */
+  gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
+  gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
+  gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
+  gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
+  gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
+  gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
+  gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
+  gcn_builtin_types[GCN_BTI_EXEC] = unsigned_intDI_type_node;
+  gcn_builtin_types[GCN_BTI_SF] = float32_type_node;
+
+  /* Vectors of 64 elements.  */
+  gcn_builtin_types[GCN_BTI_V64SI] = build_vector_type (intSI_type_node, 64);
+  gcn_builtin_types[GCN_BTI_V64SF] = build_vector_type (float_type_node, 64);
+  gcn_builtin_types[GCN_BTI_V64PTR]
+    = build_vector_type (unsigned_intDI_type_node
+                         /*build_pointer_type
+                            (integer_type_node) */
+                         , 64);
+
+  /* Pointers into the flat address space.  */
+  tree flat_int = build_distinct_type_copy (intSI_type_node);
+  TYPE_ADDR_SPACE (flat_int) = ADDR_SPACE_FLAT;
+  gcn_builtin_types[GCN_BTI_SIPTR] = build_pointer_type (flat_int);
+
+  tree flat_float = build_distinct_type_copy (float_type_node);
+  TYPE_ADDR_SPACE (flat_float) = ADDR_SPACE_FLAT;
+  gcn_builtin_types[GCN_BTI_SFPTR] = build_pointer_type (flat_float);
+
+  tree flat_void = build_distinct_type_copy (void_type_node);
+  TYPE_ADDR_SPACE (flat_void) = ADDR_SPACE_FLAT;
+  gcn_builtin_types[GCN_BTI_VOIDPTR] = build_pointer_type (flat_void);
+
+  /* Pointer into the LDS address space.  */
+  tree lds_void = build_distinct_type_copy (void_type_node);
+  TYPE_ADDR_SPACE (lds_void) = ADDR_SPACE_LDS;
+  gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (lds_void);
+}
+
+/* Implement TARGET_INIT_BUILTINS.
+
+   Set up all builtin functions for this target.  For every named entry of
+   gcn_builtins a function type is built from the entry's parameter codes
+   and a decl called "__builtin_gcn_<name>" is registered and recorded in
+   gcn_builtin_decls.  Entries without a name are skipped.  */
+
+static void
+gcn_init_builtins (void)
+{
+  gcn_init_builtin_types ();
+
+  struct gcn_builtin_description *d;
+  unsigned int i;
+  for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
+    {
+      tree p;
+      char name[64];		/* build_function will make a copy.  */
+      int parm;
+
+      /* FIXME: Is this necessary/useful?  */
+      if (d->name == 0)
+        continue;
+
+      /* Find last parm.  */
+      for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
+        ;
+
+      /* Build the argument list back to front; parm[0] is the return
+         type and is not part of it.  */
+      p = void_list_node;
+      while (parm > 1)
+        p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
+
+      p = build_function_type (gcn_builtin_types[d->parm[0]], p);
+
+      /* Format with a bound so that an over-long name in the table is
+         caught here rather than overrunning the buffer.  */
+      int len = snprintf (name, sizeof name, "__builtin_gcn_%s", d->name);
+      gcc_assert (len >= 0 && (size_t) len < sizeof name);
+
+      gcn_builtin_decls[i]
+        = add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
+
+      /* These builtins don't throw.  */
+      TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
+    }
+
+/* FIXME: remove the ifdef once OpenACC support is merged upstream.  */
+#ifdef BUILT_IN_GOACC_SINGLE_START
+  /* These builtins need to take/return an LDS pointer: override the generic
+     versions here.  */
+
+  set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
+                    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
+
+  set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
+                    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
+                    false);
+
+  set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
+                    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
+                    false);
+
+  set_builtin_decl (BUILT_IN_GOACC_BARRIER,
+                    gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
+#endif
+}
+
+/* Expand the CMP_SWAP GCN builtins.  We have our own versions that do
+   not require taking the address of any object, other than the memory
+   cell being operated on.
+
+   EXP is the call: argument 0 is the address of the cell, and arguments 1
+   and 2 are passed as the third and fourth operands of the
+   sync_compare_and_swap pattern.  The pattern's result goes to TARGET
+   (a fresh pseudo if TARGET is null), which is returned.  The SImode
+   pattern is used when the call's type has SImode and the DImode pattern
+   otherwise.
+
+   Helper function for gcn_expand_builtin_1.  */
+
+static rtx
+gcn_expand_cmp_swap (tree exp, rtx target)
+{
+  tree ptr_arg = CALL_EXPR_ARG (exp, 0);
+  machine_mode val_mode = TYPE_MODE (TREE_TYPE (exp));
+  addr_space_t as = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (ptr_arg)));
+  machine_mode addr_mode = gcn_addr_space_address_mode (as);
+
+  if (target == NULL_RTX)
+    target = gen_reg_rtx (val_mode);
+
+  rtx addr = expand_expr (ptr_arg, NULL_RTX, addr_mode, EXPAND_NORMAL);
+  rtx cmp_val = expand_expr (CALL_EXPR_ARG (exp, 1),
+                             NULL_RTX, val_mode, EXPAND_NORMAL);
+  rtx new_val = expand_expr (CALL_EXPR_ARG (exp, 2),
+                             NULL_RTX, val_mode, EXPAND_NORMAL);
+
+  /* The cell is addressed through a register, in the pointer's own
+     address space.  */
+  rtx cell = gen_rtx_MEM (val_mode, force_reg (addr_mode, addr));
+  set_mem_addr_space (cell, as);
+
+  /* Both value operands are put in registers.  */
+  if (!REG_P (cmp_val))
+    cmp_val = copy_to_mode_reg (val_mode, cmp_val);
+  if (!REG_P (new_val))
+    new_val = copy_to_mode_reg (val_mode, new_val);
+
+  if (val_mode == SImode)
+    emit_insn (gen_sync_compare_and_swapsi (target, cell, cmp_val, new_val));
+  else
+    emit_insn (gen_sync_compare_and_swapdi (target, cell, cmp_val, new_val));
+
+  return target;
+}
+
+/* Expand many different builtins.
+
+ EXP is the CALL_EXPR being expanded; the builtin is selected by the
+ DECL_FUNCTION_CODE of the called declaration. TARGET is where the result
+ should go, and IGNORE is nonzero when the result is unused, in which case
+ most value-producing cases return TARGET without emitting anything.
+ The subtarget, mode and builtin-description parameters are unused here.
+ Function codes that are not handled reach gcc_unreachable.
+
+ Intended for use in gcn-builtins.def. */
+
+static rtx
+gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
+ machine_mode /*mode */ , int ignore,
+ struct gcn_builtin_description *)
+{
+ tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
+ switch (DECL_FUNCTION_CODE (fndecl))
+ {
+ case GCN_BUILTIN_FLAT_LOAD_INT32:
+ { /* The load emission below is commented out, so this case only expands
+ and register-forces its two arguments before returning TARGET. */
+ if (ignore)
+ return target;
+ /*rtx exec = */
+ force_reg (DImode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
+ EXPAND_NORMAL));
+ /*rtx ptr = */
+ force_reg (V64DImode,
+ expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
+ EXPAND_NORMAL));
+ /*emit_insn (gen_vector_flat_loadv64si
+ (target, gcn_gen_undef (V64SImode), ptr, exec)); */
+ return target;
+ }
+ case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
+ case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
+ { /* Arguments: 0 = exec mask, 1 = base pointer, 2 = per-lane offsets. */
+ if (ignore)
+ return target;
+ rtx exec = force_reg (DImode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ DImode,
+ EXPAND_NORMAL));
+ rtx ptr = force_reg (DImode,
+ expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
+ V64DImode,
+ EXPAND_NORMAL));
+ rtx offsets = force_reg (V64SImode,
+ expand_expr (CALL_EXPR_ARG (exp, 2),
+ NULL_RTX, V64DImode,
+ EXPAND_NORMAL));
+ rtx addrs = gen_reg_rtx (V64DImode); /* NOTE(review): presumably one
+ address per lane, PTR plus OFFSETS shifted left by 2 - confirm against
+ the ashlv64si3_vector and addv64di3_zext_dup2 patterns. */
+ rtx tmp = gen_reg_rtx (V64SImode);
+ emit_insn (gen_ashlv64si3_vector (tmp, offsets,
+ GEN_INT (2),
+ exec, gcn_gen_undef (V64SImode)));
+ emit_insn (gen_addv64di3_zext_dup2 (addrs, tmp, ptr, exec,
+ gcn_gen_undef (V64DImode)));
+ rtx mem = gen_rtx_MEM (GET_MODE (target), addrs); /* NOTE(review): TARGET
+ is used here without a null check - confirm callers always supply one. */
+ /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
+ /* FIXME: set attributes. */
+ emit_insn (gen_mov_with_exec (target, mem, exec));
+ return target;
+ }
+ case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
+ case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
+ { /* Arguments: 0 = exec mask, 1 = base pointer, 2 = per-lane offsets,
+ 3 = value to store. IGNORE is not consulted for stores. */
+ rtx exec = force_reg (DImode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ DImode,
+ EXPAND_NORMAL));
+ rtx ptr = force_reg (DImode,
+ expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
+ V64DImode,
+ EXPAND_NORMAL));
+ rtx offsets = force_reg (V64SImode,
+ expand_expr (CALL_EXPR_ARG (exp, 2),
+ NULL_RTX, V64DImode,
+ EXPAND_NORMAL));
+ machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
+ 3)));
+ rtx val = force_reg (vmode,
+ expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
+ vmode,
+ EXPAND_NORMAL));
+ rtx addrs = gen_reg_rtx (V64DImode);
+ rtx tmp = gen_reg_rtx (V64SImode);
+ emit_insn (gen_ashlv64si3_vector (tmp, offsets,
+ GEN_INT (2),
+ exec, gcn_gen_undef (V64SImode)));
+ emit_insn (gen_addv64di3_zext_dup2 (addrs, tmp, ptr, exec,
+ gcn_gen_undef (V64DImode)));
+ rtx mem = gen_rtx_MEM (vmode, addrs);
+ /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
+ /* FIXME: set attributes. */
+ emit_insn (gen_mov_with_exec (mem, val, exec));
+ return target;
+ }
+ case GCN_BUILTIN_SQRTVF:
+ { /* V64SF square root, emitted with the full exec register. */
+ if (ignore)
+ return target;
+ rtx exec = gcn_full_exec_reg ();
+ rtx arg = force_reg (V64SFmode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ V64SFmode,
+ EXPAND_NORMAL));
+ emit_insn (gen_sqrtv64sf_vector
+ (target, arg, exec, gcn_gen_undef (V64SFmode)));
+ return target;
+ }
+ case GCN_BUILTIN_SQRTF:
+ { /* SFmode square root, emitted with the scalar exec value. */
+ if (ignore)
+ return target;
+ rtx exec = gcn_scalar_exec ();
+ rtx arg = force_reg (SFmode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ SFmode,
+ EXPAND_NORMAL));
+ emit_insn (gen_sqrtsf_scalar (target, arg, exec));
+ return target;
+ }
+ case GCN_BUILTIN_OMP_DIM_SIZE:
+ { /* Forwards argument 0 to the oacc_dim_size pattern. */
+ if (ignore)
+ return target;
+ emit_insn (gen_oacc_dim_size (target,
+ expand_expr (CALL_EXPR_ARG (exp, 0),
+ NULL_RTX, SImode,
+ EXPAND_NORMAL)));
+ return target;
+ }
+ case GCN_BUILTIN_OMP_DIM_POS:
+ { /* Forwards argument 0 to the oacc_dim_pos pattern. */
+ if (ignore)
+ return target;
+ emit_insn (gen_oacc_dim_pos (target,
+ expand_expr (CALL_EXPR_ARG (exp, 0),
+ NULL_RTX, SImode,
+ EXPAND_NORMAL)));
+ return target;
+ }
+ case GCN_BUILTIN_CMP_SWAP:
+ case GCN_BUILTIN_CMP_SWAPLL:
+ return gcn_expand_cmp_swap (exp, target);
+
+ case GCN_BUILTIN_ACC_SINGLE_START:
+ { /* Produces the result of comparing the dimension-1 position with
+ zero, in TARGET when that is a register, else in a new BImode pseudo. */
+ if (ignore)
+ return target;
+
+ rtx wavefront = gcn_oacc_dim_pos (1);
+ rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
+ rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
+ emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
+ return cc;
+ }
+
+ case GCN_BUILTIN_ACC_SINGLE_COPY_START:
+ { /* Returns argument 0 in a register, overwritten with zero unless the
+ branch on a nonzero dimension-1 position skips the move. */
+ rtx blk = force_reg (SImode,
+ expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
+ SImode, EXPAND_NORMAL));
+ rtx wavefront = gcn_oacc_dim_pos (1);
+ rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
+ rtx not_zero = gen_label_rtx ();
+ emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
+ emit_move_insn (blk, const0_rtx);
+ emit_label (not_zero);
+ return blk;
+ }
+
+ case GCN_BUILTIN_ACC_SINGLE_COPY_END: /* Emits no code. */
+ return target;
+
+ case GCN_BUILTIN_ACC_BARRIER: /* Emits the wavefront barrier pattern. */
+ emit_insn (gen_gcn_wavefront_barrier ());
+ return target;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* Expand a builtin that maps onto a simple arithmetic or bitwise binary
+   instruction pattern.
+
+   EXP is the call: argument 0 is the exec mask, arguments 1 and 2 are the
+   two source operands, and an optional argument 3 supplies the previous
+   value of the destination (an undefined value of TARGET's mode is used
+   when the call has no fourth argument).  D describes the builtin and
+   supplies the insn code.  Nothing is emitted when IGNORE is nonzero.
+   Returns TARGET.
+
+   Intended for use with gcn_builtins table.  */
+
+static rtx
+gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
+                          machine_mode /*mode */ , int ignore,
+                          struct gcn_builtin_description *d)
+{
+  const int icode = d->icode;
+
+  if (ignore)
+    return target;
+
+  rtx exec_mask = expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
+                               EXPAND_NORMAL);
+  exec_mask = force_reg (DImode, exec_mask);
+
+  /* Expand source operands 1 and 2, forcing each into a register only when
+     the pattern's own predicate rejects the expanded form.  */
+  rtx srcs[2];
+  for (int n = 1; n <= 2; n++)
+    {
+      machine_mode op_mode = insn_data[icode].operand[n].mode;
+      rtx op = expand_expr (CALL_EXPR_ARG (exp, n), NULL_RTX, op_mode,
+                            EXPAND_NORMAL);
+      if (!insn_data[icode].operand[n].predicate (op, op_mode))
+        op = force_reg (op_mode, op);
+      srcs[n - 1] = op;
+    }
+
+  /* The last pattern operand is the value kept for inactive lanes.  */
+  rtx previous;
+  if (call_expr_nargs (exp) != 4)
+    previous = gcn_gen_undef (GET_MODE (target));
+  else
+    {
+      machine_mode prev_mode = insn_data[icode].operand[4].mode;
+      rtx prev_arg = expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
+                                  prev_mode, EXPAND_NORMAL);
+      previous = force_reg (prev_mode, prev_arg);
+    }
+
+  emit_insn (GEN_FCN (icode) (target, srcs[0], srcs[1], exec_mask,
+                              previous));
+  return target;
+}
+
+/* Implement TARGET_EXPAND_BUILTIN.
+
+   EXP is a call to a built-in function.  The function code of the called
+   declaration indexes the gcn_builtins table, and the expander recorded
+   there does the real work.  TARGET is a suggested place for the result
+   (in mode MODE if that's convenient), SUBTARGET may be used to compute
+   one of EXP's operands, and IGNORE is nonzero if the value is to be
+   ignored.  A "sorry" diagnostic is issued for table entries marked
+   B_UNIMPLEMENTED.  */
+
+rtx
+gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
+                    int ignore)
+{
+  unsigned int fcode
+    = DECL_FUNCTION_CODE (TREE_OPERAND (CALL_EXPR_FN (exp), 0));
+
+  gcc_assert (fcode < GCN_BUILTIN_MAX);
+
+  struct gcn_builtin_description *desc = gcn_builtins + fcode;
+  if (desc->type == B_UNIMPLEMENTED)
+    sorry ("Builtin not implemented");
+
+  return desc->expander (exp, target, subtarget, mode, ignore, desc);
+}
+
+/* }}} */
+/* {{{ Vectorization. */
+
+/* Implement TARGET_VECTORIZE_GET_MASK_MODE.
+
+   The autovectorizer needs a vector type for the "truth vector" it uses
+   with fully masked loops and similar constructs, so the backend carries
+   minimal V64BImode support (just enough to enable the middle-end
+   optimizations).  Both arguments are ignored: the answer is always
+   V64BImode.
+
+   V64BImode is converted to DImode at expand time.  */
+
+opt_machine_mode
+gcn_vectorize_get_mask_mode (poly_uint64 ARG_UNUSED (nunits),
+                             poly_uint64 ARG_UNUSED (length))
+{
+  const machine_mode mask_mode = V64BImode;
+  return mask_mode;
+}
+
+/* Convert vector mask mode.
+
+   The autovectorizer uses a V64BImode mask, but the backend uses DImode.
+   This is intended to convert the mask mode during the expand pass only.
+
+   X must have mode V64BImode.  Registers and subregs are rewritten as a
+   DImode subreg, constant vectors are folded into a CONST_INT with bit I
+   set when element I is nonzero, and memory references are copied with
+   their mode changed to DImode.  Any other rtx code is a bug.  */
+
+rtx
+gcn_convert_mask_mode (rtx x)
+{
+  gcc_assert (GET_MODE (x) == V64BImode);
+
+  if (REG_P (x) || SUBREG_P (x))
+    return simplify_gen_subreg (DImode, x, V64BImode, 0);
+  else if (GET_CODE (x) == CONST_VECTOR)
+    {
+      /* Accumulate in an unsigned type: setting bit 63 by shifting a
+         signed 1 would shift into the sign bit.  */
+      unsigned HOST_WIDE_INT mask = 0;
+      for (int i = 0; i < 64; i++)
+        if (INTVAL (CONST_VECTOR_ELT (x, i)))
+          mask |= HOST_WIDE_INT_1U << i;
+
+      return gen_rtx_CONST_INT (VOIDmode, (HOST_WIDE_INT) mask);
+    }
+  else if (MEM_P (x))
+    {
+      rtx copy = shallow_copy_rtx (x);
+      PUT_MODE (copy, DImode);
+      return copy;
+    }
+  else
+    {
+      gcc_unreachable ();
+      return x;
+    }
+}
+
+/* Return an RTX that references a vector with the i-th lane containing
+   PERM[i]*4.  PERM must point to 64 entries.
+
+   Helper function for gcn_vectorize_vec_perm_const.  */
+
+static rtx
+gcn_make_vec_perm_address (unsigned int *perm)
+{
+  /* Start from an all-zero address vector.  */
+  rtx addr = gen_reg_rtx (V64SImode);
+  emit_insn (gen_mov_with_exec (addr, gcn_vec_constant (V64SImode, 0)));
+
+  /* Permutation addresses use byte addressing.  With each vector lane
+     being 4 bytes wide, and with 64 lanes in total, only bits 2..7 are
+     significant, so only those are set.
+
+     The permutation given to the vec_perm* patterns ranges from 0 to 2N-1
+     to select between lanes in two vectors, but as the DS_BPERMUTE*
+     instructions only take one source vector, the most-significant bit can
+     be ignored here.  Instead, EXEC masking selects the relevant part of
+     each source vector after they are permuted separately.
+
+     Each address bit is handled in turn: collect the lanes whose byte
+     address has that bit set, then add the bit's value in just those
+     lanes.  */
+  for (int bit = 2; bit < 8; bit++)
+    {
+      const uint64_t addr_bit = (uint64_t) 1 << bit;
+      uint64_t lanes = 0;
+
+      for (int lane = 0; lane < 64; lane++)
+        if ((perm[lane] * 4) & addr_bit)
+          lanes |= (uint64_t) 1 << lane;
+
+      if (lanes == 0)
+        continue;
+
+      emit_insn (gen_addv64si3_vector (addr, addr,
+                                       gcn_vec_constant (V64SImode,
+                                                         addr_bit),
+                                       get_exec (lanes), addr));
+    }
+
+  return addr;
+}
+
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
+
+   Return true if permutation with SEL is possible.
+
+   If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
+   permutations.  VMODE is the vector mode being permuted; V64BImode is
+   rejected because it is a bitmask rather than a true vector.  */
+
+static bool
+gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
+                              rtx src0, rtx src1,
+                              const vec_perm_indices & sel)
+{
+  unsigned int nelt = GET_MODE_NUNITS (vmode);
+
+  gcc_assert (VECTOR_MODE_P (vmode));
+  gcc_assert (nelt <= 64);
+  gcc_assert (sel.length () == nelt);
+
+  if (vmode == V64BImode)
+    {
+      /* This isn't a true vector, it's a bitmask.  */
+      return false;
+    }
+
+  if (!dst)
+    {
+      /* All vector permutations are possible on this architecture,
+         with varying degrees of efficiency depending on the permutation.  */
+      return true;
+    }
+
+  /* gcn_make_vec_perm_address reads all 64 entries, so zero the array
+     first in case NELT is smaller than 64.  */
+  unsigned int perm[64] = { 0 };
+  for (unsigned int i = 0; i < nelt; ++i)
+    perm[i] = sel[i] & (2 * nelt - 1);
+
+  /* Make life a bit easier by swapping operands if necessary so that
+     the first element always comes from src0.  */
+  if (perm[0] >= nelt)
+    {
+      rtx temp = src0;
+      src0 = src1;
+      src1 = temp;
+
+      for (unsigned int i = 0; i < nelt; ++i)
+        if (perm[i] < nelt)
+          perm[i] += nelt;
+        else
+          perm[i] -= nelt;
+    }
+
+  /* TODO: There are more efficient ways to implement certain permutations
+     using ds_swizzle_b32 and/or DPP.  Test for and expand them here, before
+     this more inefficient generic approach is used.  */
+
+  /* Build the mask of lanes taken from src1.  The mask is unsigned and
+     each bit is formed directly, so that no signed value is ever shifted
+     into or past the sign bit.  */
+  uint64_t src1_lanes = 0;
+  for (unsigned int i = 0; i < nelt; ++i)
+    if (perm[i] >= nelt)
+      src1_lanes |= (uint64_t) 1 << i;
+
+  rtx addr = gcn_make_vec_perm_address (perm);
+  rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
+
+  switch (vmode)
+    {
+    case E_V64QImode:
+      ds_bpermute = gen_ds_bpermutev64qi;
+      break;
+    case E_V64HImode:
+      ds_bpermute = gen_ds_bpermutev64hi;
+      break;
+    case E_V64SImode:
+      ds_bpermute = gen_ds_bpermutev64si;
+      break;
+    case E_V64HFmode:
+      ds_bpermute = gen_ds_bpermutev64hf;
+      break;
+    case E_V64SFmode:
+      ds_bpermute = gen_ds_bpermutev64sf;
+      break;
+    case E_V64DImode:
+      ds_bpermute = gen_ds_bpermutev64di;
+      break;
+    case E_V64DFmode:
+      ds_bpermute = gen_ds_bpermutev64df;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Load elements from src0 to dst.  Lane 0 always comes from src0 after
+     the swap above, so the src1 mask can never be all-ones.  */
+  gcc_assert (~src1_lanes);
+  emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
+
+  /* Load elements from src1 to dst.  */
+  if (src1_lanes)
+    {
+      /* Masking a lane masks both the destination and source lanes for
+         DS_BPERMUTE, so we need to have all lanes enabled for the permute,
+         then add an extra masked move to merge the results of permuting
+         the two source vectors together.  */
+      rtx tmp = gen_reg_rtx (vmode);
+      emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
+      emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
+    }
+
+  return true;
+}
+
+/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.
+
+   Return true if vector MODE is supported with at least move
+   instructions.  */
+
+static bool
+gcn_vector_mode_supported_p (machine_mode mode)
+{
+  switch (mode)
+    {
+    /* FIXME: Enable V64QImode and V64HImode.
+       We should support these modes, but vector operations are usually
+       assumed to automatically truncate types, and GCN does not.  We
+       need to add explicit truncates and/or use SDWA for QI/HI insns.  */
+    case E_V64SImode:
+    case E_V64DImode:
+    case E_V64SFmode:
+    case E_V64DFmode:
+    /* For the mask mode only.  */
+    case E_V64BImode:
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
+
+   Map scalar MODE onto the 64-lane vector mode with the same element
+   mode, enabling autovectorization for those modes.  Any other scalar
+   mode yields word_mode.  */
+
+static machine_mode
+gcn_vectorize_preferred_simd_mode (scalar_mode mode)
+{
+  machine_mode simd_mode;
+
+  switch (mode)
+    {
+    case E_QImode: simd_mode = V64QImode; break;
+    case E_HImode: simd_mode = V64HImode; break;
+    case E_SImode: simd_mode = V64SImode; break;
+    case E_DImode: simd_mode = V64DImode; break;
+    case E_SFmode: simd_mode = V64SFmode; break;
+    case E_DFmode: simd_mode = V64DFmode; break;
+    default:       simd_mode = word_mode; break;
+    }
+
+  return simd_mode;
+}
+
+/* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
+
+   Return true if the target supports misaligned vector store/load of a
+   specific factor denoted in the MISALIGNMENT parameter.  TYPE is the
+   type whose alignment the misalignment is checked against; IS_PACKED is
+   true for accesses to members of packed structures, which are never
+   supported.  */
+
+static bool
+gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
+                                           const_tree type, int misalignment,
+                                           bool is_packed)
+{
+  if (is_packed)
+    return false;
+
+  /* An unknown misalignment (-1) can be handled so long as the access is
+     not to a member of a packed data structure.  Otherwise the
+     misalignment must be a multiple of the natural alignment of TYPE;
+     this is probably always going to be true in practice, since we've
+     already established that this isn't a packed access.  */
+  return (misalignment == -1
+          || misalignment % TYPE_ALIGN_UNIT (type) == 0);
+}
+
+/* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
+
+   Return true if vector alignment is reachable (by peeling N iterations)
+   for the given scalar type TYPE.  Only IS_PACKED influences the
+   answer.  */
+
+static bool
+gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
+{
+  if (is_packed)
+    return false;
+
+  /* Vectors which aren't in packed structures will not be less aligned
+     than the natural alignment of their element type, so this is safe.  */
+  return true;
+}
+
+/* Generate DPP instructions used for vector reductions.
+
+   The opcode is given by INSN.
+   The first operand of the operation is shifted right by SHIFT vector lanes.
+   SHIFT must be a power of 2.  If SHIFT is 16, the 15th lane of each row is
+   broadcast to the next row (thereby acting like a shift of 16 for the end
+   of each row).  If SHIFT is 32, lane 31 is broadcast to all the
+   following lanes (thereby acting like a shift of 32 for lane 63).
+
+   MODE and UNSPEC decide whether explicit "vcc" operands are added: only
+   integer vector modes with one of the carry unspecs get them.
+
+   Returns a pointer to a static buffer holding the output template; the
+   contents are overwritten by the next call.  */
+
+char *
+gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
+                         int unspec, int shift)
+{
+  static char buf[64];
+  const char *dpp;
+  const char *vcc_in = "";
+  const char *vcc_out = "";
+
+  /* Add the vcc operand if needed.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
+        vcc_in = ", vcc";
+
+      if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
+          || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
+        vcc_out = ", vcc";
+    }
+
+  /* Add the DPP modifiers.  */
+  switch (shift)
+    {
+    case 1:
+      dpp = "row_shr:1 bound_ctrl:0";
+      break;
+    case 2:
+      dpp = "row_shr:2 bound_ctrl:0";
+      break;
+    case 4:
+      dpp = "row_shr:4 bank_mask:0xe";
+      break;
+    case 8:
+      dpp = "row_shr:8 bank_mask:0xc";
+      break;
+    case 16:
+      dpp = "row_bcast:15 row_mask:0xa";
+      break;
+    case 32:
+      dpp = "row_bcast:31 row_mask:0xc";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Bound the write to the buffer and insist that the whole template
+     fitted; a long opcode name must not silently overrun BUF.  */
+  int len = snprintf (buf, sizeof buf, "%s\t%%0%s, %%1, %%2%s %s",
+                      insn, vcc_out, vcc_in, dpp);
+  gcc_assert (len >= 0 && (size_t) len < sizeof buf);
+
+  return buf;
+}
+
+/* Generate vector reductions in terms of DPP instructions.
+
+   The vector register SRC of mode MODE is reduced using the operation given
+   by UNSPEC, and the scalar result is returned in lane 63 of a vector
+   register.  Integer additions are switched to the carry-setting unspec
+   on GCN3, or for V64DImode on any ISA, and then carry a clobber of
+   VCC.  */
+
+rtx
+gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
+{
+  rtx result = gen_reg_rtx (mode);
+
+  bool sets_carry = false;
+  if (unspec == UNSPEC_PLUS_DPP_SHR
+      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+      && (TARGET_GCN3 || mode == V64DImode))
+    {
+      sets_carry = true;
+      unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
+    }
+
+  /* Reduce by first applying the operation to every pair of lanes, then
+     to every pair of results from the previous step (thereby effectively
+     reducing every 4 lanes) and so on until all lanes are reduced: six
+     steps with shifts 1, 2, 4, 8, 16 and 32.  The first step reads SRC;
+     every later step reads the running result.  */
+  rtx input = src;
+  for (int shift = 1; shift <= 32; shift *= 2)
+    {
+      rtvec operands = gen_rtvec (3, input, input, GEN_INT (shift));
+      rtx pattern = gen_rtx_SET (result,
+                                 gen_rtx_UNSPEC (mode, operands, unspec));
+
+      /* Add clobber for instructions that set the carry flags.  */
+      if (sets_carry)
+        {
+          rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+          rtx clobber = gen_rtx_CLOBBER (VOIDmode, vcc);
+          pattern = gen_rtx_PARALLEL (VOIDmode,
+                                      gen_rtvec (2, pattern, clobber));
+        }
+
+      emit_insn (pattern);
+      input = result;
+    }
+
+  return result;
+}
+
+/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST.
+
+   Every kind of statement is given the same cost of 1, whatever the
+   vector type or misalignment, so that vectorization is always
+   chosen.  */
+
+int
+gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
+                        tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
+{
+  const int uniform_cost = 1;
+  return uniform_cost;
+}
+
+/* }}} */
+/* {{{ md_reorg pass. */
+
+/* Identify VMEM instructions from their "type" attribute.
+
+   Returns true for the MUBUF, MTBUF and FLAT types and false for every
+   other known type.  The switch deliberately has no default case, so a
+   value outside the listed enumerators reaches gcc_unreachable.  */
+
+static bool
+gcn_vmem_insn_p (attr_type type)
+{
+  switch (type)
+    {
+    /* Everything that is not a vector memory access.  */
+    case TYPE_UNKNOWN:
+    case TYPE_SOP1: case TYPE_SOP2: case TYPE_SOPK:
+    case TYPE_SOPC: case TYPE_SOPP:
+    case TYPE_SMEM:
+    case TYPE_DS:
+    case TYPE_VOP2: case TYPE_VOP1: case TYPE_VOPC:
+    case TYPE_VOP3A: case TYPE_VOP3B:
+    case TYPE_VOP_SDWA: case TYPE_VOP_DPP:
+    case TYPE_MULT: case TYPE_VMULT:
+      return false;
+
+    /* The vector memory instruction types.  */
+    case TYPE_MUBUF: case TYPE_MTBUF: case TYPE_FLAT:
+      return true;
+    }
+
+  gcc_unreachable ();
+  return false;
+}
+
+/* If INSN sets the EXEC register to a constant value, return the value,
+   otherwise return zero.
+
+   Only a plain SET whose destination is the DImode EXEC register and
+   whose source is a CONST_INT qualifies; debug insns and non-insns never
+   do.  */
+
+static
+int64_t gcn_insn_exec_value (rtx_insn *insn)
+{
+  if (!NONDEBUG_INSN_P (insn))
+    return 0;
+
+  rtx pat = PATTERN (insn);
+  if (GET_CODE (pat) != SET)
+    return 0;
+
+  rtx lhs = SET_DEST (pat);
+  rtx rhs = SET_SRC (pat);
+
+  bool writes_exec = (GET_MODE (lhs) == DImode
+                      && REG_P (lhs)
+                      && REGNO (lhs) == EXEC_REG);
+  if (!writes_exec || !CONST_INT_P (rhs))
+    return 0;
+
+  return INTVAL (rhs);
+}
+
+/* Sets the EXEC register before INSN to the value that it had after
+   LAST_EXEC_DEF.  The constant value of the EXEC register is returned if
+   known, otherwise it returns zero.
+
+   CURR_EXEC and CURR_EXEC_KNOWN describe the value EXEC currently holds;
+   when that already equals the constant being restored, nothing is
+   emitted.  When the value to restore is not a constant it is saved into
+   EXEC_SAVE_REG right after LAST_EXEC_DEF; LAST_EXEC_DEF_SAVED records
+   whether that save has already been emitted and is set to true once it
+   has.  */
+
+static
+int64_t gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def,
+                          int64_t curr_exec, bool curr_exec_known,
+                          bool &last_exec_def_saved)
+{
+  rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
+  rtx exec;
+
+  int64_t exec_value = gcn_insn_exec_value (last_exec_def);
+
+  if (exec_value)
+    {
+      /* If the EXEC value is a constant and it happens to be the same as the
+         current EXEC value, the restore can be skipped.  */
+      if (curr_exec_known && exec_value == curr_exec)
+        return exec_value;
+
+      exec = GEN_INT (exec_value);
+    }
+  else
+    {
+      /* If the EXEC value is not a constant, save it in a register after the
+         point of definition.  */
+      rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
+
+      if (!last_exec_def_saved)
+        {
+          start_sequence ();
+          emit_insn (gen_move_insn (exec_save_reg, exec_reg));
+          rtx_insn *seq = get_insns ();
+          end_sequence ();
+
+          emit_insn_after (seq, last_exec_def);
+          if (dump_file && (dump_flags & TDF_DETAILS))
+            fprintf (dump_file, "Saving EXEC after insn %d.\n",
+                     INSN_UID (last_exec_def));
+
+          last_exec_def_saved = true;
+        }
+
+      exec = exec_save_reg;
+    }
+
+  /* Restore EXEC register before the usage.  */
+  start_sequence ();
+  emit_insn (gen_move_insn (exec_reg, exec));
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+  emit_insn_before (seq, insn);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      /* EXEC_VALUE is an int64_t, which is not "long" on every host, so
+         print it with the matching <inttypes.h> conversion.  */
+      if (exec_value)
+        fprintf (dump_file,
+                 "Restoring EXEC to %" PRId64 " before insn %d.\n",
+                 exec_value, INSN_UID (insn));
+      else
+        fprintf (dump_file,
+                 "Restoring EXEC from saved value before insn %d.\n",
+                 INSN_UID (insn));
+    }
+
+  return exec_value;
+}
+
+/* Implement TARGET_MACHINE_DEPENDENT_REORG.
+
+   Ensure that pipeline dependencies and lane masking are set correctly.
+
+   The pass has two phases.  The first walks each basic block and inserts
+   moves to EXEC so that every instruction runs with the value demanded by
+   its "exec" attribute, restoring the explicitly defined value before any
+   explicit use of EXEC and at the end of blocks where EXEC is live.  The
+   second walks the whole insn stream and inserts NOPs for the register
+   dependencies listed below, which the hardware does not interlock.  */
+
+static void
+gcn_md_reorg (void)
+{
+  basic_block bb;
+  rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
+  rtx exec_lo_reg = gen_rtx_REG (SImode, EXEC_LO_REG);
+  rtx exec_hi_reg = gen_rtx_REG (SImode, EXEC_HI_REG);
+  regset_head live;
+
+  INIT_REG_SET (&live);
+
+  compute_bb_for_insn ();
+
+  if (!optimize)
+    {
+      split_all_insns ();
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        {
+          fprintf (dump_file, "After split:\n");
+          print_rtl_with_bb (dump_file, get_insns (), dump_flags);
+        }
+
+      /* Update data-flow information for split instructions.  */
+      df_insn_rescan_all ();
+    }
+
+  df_analyze ();
+
+  /* This pass ensures that the EXEC register is set correctly, according
+     to the "exec" attribute.  However, care must be taken so that the
+     value that reaches explicit uses of the EXEC register remains the
+     same as before.  */
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "BB %d:\n", bb->index);
+
+      rtx_insn *insn, *curr;
+      rtx_insn *last_exec_def = BB_HEAD (bb);
+      bool last_exec_def_saved = false;
+      bool curr_exec_explicit = true;
+      bool curr_exec_known = true;
+      int64_t curr_exec = 0;    /* 0 here means 'the value is that of EXEC
+                                   after last_exec_def is executed'.  */
+
+      FOR_BB_INSNS_SAFE (bb, insn, curr)
+        {
+          if (!NONDEBUG_INSN_P (insn))
+            continue;
+
+          if (GET_CODE (PATTERN (insn)) == USE
+              || GET_CODE (PATTERN (insn)) == CLOBBER)
+            continue;
+
+          /* Check the instruction for implicit setting of EXEC via an
+             attribute.  */
+          attr_exec exec_attr = get_attr_exec (insn);
+          int64_t new_exec;
+
+          switch (exec_attr)
+            {
+            case EXEC_SINGLE:
+              /* Instructions that do not involve memory accesses only require
+                 bit 0 of EXEC to be set.  */
+              if (gcn_vmem_insn_p (get_attr_type (insn))
+                  || get_attr_type (insn) == TYPE_DS)
+                new_exec = 1;
+              else
+                new_exec = curr_exec | 1;
+              break;
+
+            case EXEC_FULL:
+              new_exec = -1;
+              break;
+
+            default:
+              new_exec = 0;
+              break;
+            }
+
+          if (new_exec && (!curr_exec_known || new_exec != curr_exec))
+            {
+              start_sequence ();
+              emit_insn (gen_move_insn (exec_reg, GEN_INT (new_exec)));
+              rtx_insn *seq = get_insns ();
+              end_sequence ();
+              emit_insn_before (seq, insn);
+
+              /* NEW_EXEC is an int64_t, which is not "long" on every host,
+                 so print it with the matching <inttypes.h> conversion.  */
+              if (dump_file && (dump_flags & TDF_DETAILS))
+                fprintf (dump_file,
+                         "Setting EXEC to %" PRId64 " before insn %d.\n",
+                         new_exec, INSN_UID (insn));
+
+              curr_exec = new_exec;
+              curr_exec_explicit = false;
+              curr_exec_known = true;
+            }
+
+          /* The state of the EXEC register is unknown after a
+             function call.  */
+          if (CALL_P (insn))
+            curr_exec_known = false;
+
+          bool exec_lo_def_p = reg_set_p (exec_lo_reg, PATTERN (insn));
+          bool exec_hi_def_p = reg_set_p (exec_hi_reg, PATTERN (insn));
+          bool exec_used = reg_referenced_p (exec_reg, PATTERN (insn));
+
+          /* Handle explicit uses of EXEC.  If the instruction is a partial
+             explicit definition of EXEC, then treat it as an explicit use of
+             EXEC as well.  */
+          if (exec_used || exec_lo_def_p != exec_hi_def_p)
+            {
+              /* An instruction that explicitly uses EXEC should not also
+                 implicitly define it.  */
+              gcc_assert (!exec_used || !new_exec);
+
+              if (!curr_exec_known || !curr_exec_explicit)
+                {
+                  /* Restore the previous explicitly defined value.  */
+                  curr_exec = gcn_restore_exec (insn, last_exec_def,
+                                                curr_exec, curr_exec_known,
+                                                last_exec_def_saved);
+                  curr_exec_explicit = true;
+                  curr_exec_known = true;
+                }
+            }
+
+          /* Handle explicit definitions of EXEC.  */
+          if (exec_lo_def_p || exec_hi_def_p)
+            {
+              last_exec_def = insn;
+              last_exec_def_saved = false;
+              curr_exec = gcn_insn_exec_value (insn);
+              curr_exec_explicit = true;
+              curr_exec_known = true;
+
+              if (dump_file && (dump_flags & TDF_DETAILS))
+                fprintf (dump_file,
+                         "Found %s definition of EXEC at insn %d.\n",
+                         exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
+                         INSN_UID (insn));
+            }
+        }
+
+      COPY_REG_SET (&live, DF_LR_OUT (bb));
+      df_simulate_initialize_backwards (bb, &live);
+
+      /* If EXEC is live after the basic block, restore the value of EXEC
+         at the end of the block.  */
+      if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
+           || REGNO_REG_SET_P (&live, EXEC_HI_REG))
+          && (!curr_exec_known || !curr_exec_explicit))
+        {
+          rtx_insn *end_insn = BB_END (bb);
+
+          /* If the instruction is not a jump instruction, do the restore
+             after the last instruction in the basic block.  */
+          if (NONJUMP_INSN_P (end_insn))
+            end_insn = NEXT_INSN (end_insn);
+
+          gcn_restore_exec (end_insn, last_exec_def, curr_exec,
+                            curr_exec_known, last_exec_def_saved);
+        }
+    }
+
+  CLEAR_REG_SET (&live);
+
+  /* "Manually Inserted Wait States (NOPs)."
+
+     GCN hardware detects most kinds of register dependencies, but there
+     are some exceptions documented in the ISA manual.  This pass
+     detects the missed cases, and inserts the documented number of NOPs
+     required for correct execution.  */
+
+  const int max_waits = 5;
+  struct ilist
+  {
+    rtx_insn *insn;
+    attr_unit unit;
+    HARD_REG_SET writes;
+    int age;
+  } back[max_waits];
+  int oldest = 0;
+  for (int i = 0; i < max_waits; i++)
+    {
+      back[i].insn = NULL;
+      /* The aging loop below increments AGE on every slot, including
+         ones not yet filled, so give it a defined starting value.  */
+      back[i].age = 0;
+    }
+
+  rtx_insn *insn, *last_insn = NULL;
+  for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
+    {
+      if (!NONDEBUG_INSN_P (insn))
+        continue;
+
+      if (GET_CODE (PATTERN (insn)) == USE
+          || GET_CODE (PATTERN (insn)) == CLOBBER)
+        continue;
+
+      attr_type itype = get_attr_type (insn);
+      attr_unit iunit = get_attr_unit (insn);
+      HARD_REG_SET ireads, iwrites;
+      CLEAR_HARD_REG_SET (ireads);
+      CLEAR_HARD_REG_SET (iwrites);
+      note_stores (PATTERN (insn), record_hard_reg_sets, &iwrites);
+      note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
+
+      /* Scan recent previous instructions for dependencies not handled in
+         hardware.  */
+      int nops_rqd = 0;
+      for (int i = oldest; i < oldest + max_waits; i++)
+        {
+          struct ilist *prev_insn = &back[i % max_waits];
+
+          if (!prev_insn->insn)
+            continue;
+
+          /* VALU writes SGPR followed by VMEM reading the same SGPR
+             requires 5 wait states.  */
+          if ((prev_insn->age + nops_rqd) < 5
+              && prev_insn->unit == UNIT_VECTOR
+              && gcn_vmem_insn_p (itype))
+            {
+              HARD_REG_SET regs;
+              COPY_HARD_REG_SET (regs, prev_insn->writes);
+              AND_HARD_REG_SET (regs, ireads);
+              if (hard_reg_set_intersect_p
+                  (regs, reg_class_contents[(int) SGPR_REGS]))
+                nops_rqd = 5 - prev_insn->age;
+            }
+
+          /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
+             requires 5 wait states.  */
+          if ((prev_insn->age + nops_rqd) < 5
+              && prev_insn->unit == UNIT_VECTOR
+              && iunit == UNIT_VECTOR
+              && ((hard_reg_set_intersect_p
+                   (prev_insn->writes,
+                    reg_class_contents[(int) EXEC_MASK_REG])
+                   && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
+                  ||
+                  (hard_reg_set_intersect_p
+                   (prev_insn->writes,
+                    reg_class_contents[(int) VCC_CONDITIONAL_REG])
+                   && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
+            nops_rqd = 5 - prev_insn->age;
+
+          /* VALU writes SGPR/VCC followed by v_{read,write}lane using
+             SGPR/VCC as lane select requires 4 wait states.  */
+          if ((prev_insn->age + nops_rqd) < 4
+              && prev_insn->unit == UNIT_VECTOR
+              && get_attr_laneselect (insn) == LANESELECT_YES)
+            {
+              HARD_REG_SET regs;
+              COPY_HARD_REG_SET (regs, prev_insn->writes);
+              AND_HARD_REG_SET (regs, ireads);
+              if (hard_reg_set_intersect_p
+                  (regs, reg_class_contents[(int) SGPR_REGS])
+                  || hard_reg_set_intersect_p
+                     (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
+                nops_rqd = 4 - prev_insn->age;
+            }
+
+          /* VALU writes VGPR followed by VALU_DPP reading that VGPR
+             requires 2 wait states.  */
+          if ((prev_insn->age + nops_rqd) < 2
+              && prev_insn->unit == UNIT_VECTOR
+              && itype == TYPE_VOP_DPP)
+            {
+              HARD_REG_SET regs;
+              COPY_HARD_REG_SET (regs, prev_insn->writes);
+              AND_HARD_REG_SET (regs, ireads);
+              if (hard_reg_set_intersect_p
+                  (regs, reg_class_contents[(int) VGPR_REGS]))
+                nops_rqd = 2 - prev_insn->age;
+            }
+        }
+
+      /* Insert the required number of NOPs.  */
+      for (int i = nops_rqd; i > 0; i--)
+        emit_insn_after (gen_nop (), last_insn);
+
+      /* Age the previous instructions.  We can also ignore writes to
+         registers subsequently overwritten.  */
+      HARD_REG_SET written;
+      CLEAR_HARD_REG_SET (written);
+      for (int i = oldest + max_waits - 1; i > oldest; i--)
+        {
+          struct ilist *prev_insn = &back[i % max_waits];
+
+          /* Assume all instructions are equivalent to one "wait", the same
+             as s_nop.  This is probably true for SALU, but not VALU (which
+             may take longer), so this is not optimal.  However, AMD do
+             not publish the cycle times for instructions.  */
+          prev_insn->age += 1 + nops_rqd;
+
+          IOR_HARD_REG_SET (written, iwrites);
+          AND_COMPL_HARD_REG_SET (prev_insn->writes, written);
+        }
+
+      /* Track the current instruction as a previous instruction.  */
+      back[oldest].insn = insn;
+      back[oldest].unit = iunit;
+      COPY_HARD_REG_SET (back[oldest].writes, iwrites);
+      back[oldest].age = 0;
+      oldest = (oldest + 1) % max_waits;
+
+      last_insn = insn;
+    }
+}
+
+/* }}} */
+/* {{{ OpenACC / OpenMP. */
+
+#define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
+#define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
+#define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
+
+/* Implement TARGET_GOACC_VALIDATE_DIMS.
+
+   Check the launch dimensions provided for an OpenACC compute
+   region, or routine.  DECL is the function being checked, or null when
+   the global defaults are being set up; DIMS holds the gang, worker and
+   vector sizes and is adjusted in place; FN_LEVEL is the routine level
+   (negative for a compute region).  Returns true if DIMS was changed.  */
+
+static bool
+gcn_goacc_validate_dims (tree decl, int dims[], int fn_level)
+{
+  const location_t loc
+    = decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION;
+
+  /* FIXME: remove -facc-experimental-workers when they're ready.  */
+  const int worker_limit = flag_worker_partitioning ? 4 : 1;
+
+  bool modified = false;
+
+  /* The vector size must appear to be 64, to the user, unless this is a
+     SEQ routine.  The real, internal value is always 1, which means use
+     autovectorization, but the user should not see that.  */
+  if (fn_level >= -1 && fn_level <= GOMP_DIM_VECTOR
+      && dims[GOMP_DIM_VECTOR] >= 0)
+    {
+      const int requested = dims[GOMP_DIM_VECTOR];
+
+      if (fn_level < 0 && requested != 64)
+        warning_at (loc, OPT_Wopenacc_dims,
+                    (requested
+                     ? "using vector_length (64), ignoring %d"
+                     : "using vector_length (64), ignoring runtime setting"),
+                    requested);
+
+      dims[GOMP_DIM_VECTOR] = 1;
+      modified = true;
+    }
+
+  /* Check the num workers is not too large.  */
+  if (dims[GOMP_DIM_WORKER] > worker_limit)
+    {
+      warning_at (loc, OPT_Wopenacc_dims,
+                  "using num_workers (%d), ignoring %d",
+                  worker_limit, dims[GOMP_DIM_WORKER]);
+      dims[GOMP_DIM_WORKER] = worker_limit;
+      modified = true;
+    }
+
+  /* Set global defaults.  */
+  if (decl == NULL_TREE)
+    {
+      dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
+
+      if (dims[GOMP_DIM_WORKER] < 0)
+        {
+          if (flag_worker_partitioning)
+            dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
+          else
+            dims[GOMP_DIM_WORKER] = 1;
+        }
+
+      if (dims[GOMP_DIM_GANG] < 0)
+        dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
+
+      modified = true;
+    }
+
+  return modified;
+}
+
+/* Helper function for oacc_dim_size instruction.
+   Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass.
+
+   DIM selects the dimension (0, 1 or 2).  Dimension 2 always yields the
+   constant 1; the other two yield an SImode memory reference at a fixed
+   offset from the dispatch pointer argument register.  An out-of-range
+   DIM is diagnosed and the constant 1 is returned, so that the offset
+   table is never indexed out of bounds.  */
+
+rtx
+gcn_oacc_dim_size (int dim)
+{
+  if (dim < 0 || dim > 2)
+    {
+      error ("offload dimension out of range (%d)", dim);
+      /* error () returns; hand back a harmless value rather than falling
+         through to the table lookup with a bad index.  */
+      return const1_rtx;
+    }
+
+  /* Vectors are a special case.  */
+  if (dim == 2)
+    return const1_rtx;          /* Think of this as 1 times 64.  */
+
+  static const int offset[] = {
+    /* Offsets into dispatch packet.  */
+    12,                         /* X dim = Gang / Team / Work-group.  */
+    20,                         /* Z dim = Worker / Thread / Wavefront.  */
+    16                          /* Y dim = Vector / SIMD / Work-item.  */
+  };
+  rtx addr = gen_rtx_PLUS (DImode,
+                           gen_rtx_REG (DImode,
+                                        cfun->machine->args.
+                                        reg[DISPATCH_PTR_ARG]),
+                           GEN_INT (offset[dim]));
+  return gen_rtx_MEM (SImode, addr);
+}
+
+/* Helper function for oacc_dim_pos instruction.
+   Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass.
+
+   DIM selects the dimension (0, 1 or 2).  Returns the SImode hard
+   register that holds the position in that dimension, as recorded in the
+   current function's argument-register table.  An out-of-range DIM is
+   diagnosed and constant zero is returned, so that the register table is
+   never indexed out of bounds.  */
+
+rtx
+gcn_oacc_dim_pos (int dim)
+{
+  if (dim < 0 || dim > 2)
+    {
+      error ("offload dimension out of range (%d)", dim);
+      /* error () returns; hand back a harmless value rather than falling
+         through to the table lookup with a bad index.  */
+      return const0_rtx;
+    }
+
+  static const int reg[] = {
+    WORKGROUP_ID_X_ARG,         /* Gang / Team / Work-group.  */
+    WORK_ITEM_ID_Z_ARG,         /* Worker / Thread / Wavefront.  */
+    WORK_ITEM_ID_Y_ARG          /* Vector / SIMD / Work-item.  */
+  };
+
+  int reg_num = cfun->machine->args.reg[reg[dim]];
+
+  /* The information must have been requested by the kernel.  */
+  gcc_assert (reg_num >= 0);
+
+  return gen_rtx_REG (SImode, reg_num);
+}
+
+/* Implement TARGET_GOACC_FORK_JOIN.
+
+   All three arguments are ignored and the result is always false.  */
+
+static bool
+gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
+               bool ARG_UNUSED (is_fork))
+{
+  /* The fork/join concept was invented for NVPTX; GCN does not use it
+     and relies on standard autovectorization instead.  */
+  return false;
+}
+
+/* Implement ???????
+   FIXME make this a real hook.
+
+   Adjust FNDECL such that options inherited from the host compiler
+   are made appropriate for the accelerator compiler.
+
+   Nothing is done when FNDECL carries no function-specific optimization
+   node.  Otherwise gcn_option_override is run on top of the function's
+   options; if that changes anything the resulting node is stored on
+   FNDECL and the options in force on entry are restored.  */
+
+void
+gcn_fixup_accel_lto_options (tree fndecl)
+{
+  tree fn_opts = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
+  if (!fn_opts)
+    return;
+
+  /* Snapshot of the options in force on entry.  */
+  tree entry_opts = build_optimization_node (&global_options);
+
+  /* If the function changed the optimization levels as well as
+     setting target options, start with the optimizations
+     specified.  */
+  if (fn_opts != entry_opts)
+    cl_optimization_restore (&global_options, TREE_OPTIMIZATION (fn_opts));
+
+  gcn_option_override ();
+
+  /* The target attributes may also change some optimization flags,
+     so update the optimization options if necessary.  */
+  tree adjusted_opts = build_optimization_node (&global_options);
+  if (adjusted_opts == entry_opts)
+    return;
+
+  DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = adjusted_opts;
+  cl_optimization_restore (&global_options, TREE_OPTIMIZATION (entry_opts));
+}
+
+/* }}} */
+/* {{{ ASM Output. */
+
+/* Implement TARGET_ASM_FILE_START.
+
+   Write the fixed assembler file header to asm_out_file.  */
+
+static void
+output_file_start (void)
+{
+  /* The whole header is constant text, so emit it in one go.  */
+  fputs ("\t.text\n"
+         "\t.hsa_code_object_version 2,0\n"
+         "\t.hsa_code_object_isa\n"  /* Autodetect.  */
+         "\t.section\t.AMDGPU.config\n"
+         "\t.text\n",
+         asm_out_file);
+}
+
+/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
+
+   Print the initial definition of a function name.
+
+   For GCN kernel entry points this includes all the HSA meta-data, special
+   alignment constraints that don't apply to regular functions, and magic
+   comments that pass information to mkoffload.
+
+   FILE is the assembler output stream and NAME the symbol being declared;
+   the third (unnamed) argument is not used.  */
+
+void
+gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
+{
+  int sgpr, vgpr;
+  bool xnack_enabled = false;
+  int extra_regs = 0;
+
+  /* Normal (non-kernel) functions only get a type directive and a
+     label.  */
+  if (cfun && cfun->machine && cfun->machine->normal_function)
+    {
+      fputs ("\t.type\t", file);
+      assemble_name (file, name);
+      fputs (",@function\n", file);
+      assemble_name (file, name);
+      fputs (":\n", file);
+      return;
+    }
+
+  if (!leaf_function_p ())
+    {
+      /* We cannot know how many registers function calls might use, so
+         claim all of them.  */
+      /* FIXME: restrict normal functions to a smaller set that allows
+         more optimal use of wavefronts.  */
+      vgpr = 256;
+      sgpr = 102;
+      extra_regs = 0;
+    }
+  else
+    {
+      /* Determine count of sgpr/vgpr registers by looking for last
+         one used.  */
+      for (sgpr = 101; sgpr >= 0; sgpr--)
+        if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
+          break;
+      sgpr++;
+      for (vgpr = 255; vgpr >= 0; vgpr--)
+        if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
+          break;
+      vgpr++;
+
+      /* Pick the largest applicable count of extra (special) SGPRs.
+         This must be a single if/else-if chain: the larger xnack count
+         must not be overwritten by the flat-scratch or VCC counts.  */
+      if (xnack_enabled)
+        extra_regs = 6;
+      else if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG)
+               || df_regs_ever_live_p (FLAT_SCRATCH_HI_REG))
+        extra_regs = 4;
+      else if (df_regs_ever_live_p (VCC_LO_REG)
+               || df_regs_ever_live_p (VCC_HI_REG))
+        extra_regs = 2;
+    }
+
+  fputs ("\t.align\t256\n", file);
+  fputs ("\t.type\t", file);
+  assemble_name (file, name);
+  fputs (",@function\n\t.amdgpu_hsa_kernel\t", file);
+  assemble_name (file, name);
+  fputs ("\n", file);
+  assemble_name (file, name);
+  fputs (":\n", file);
+  fprintf (file, "\t.amd_kernel_code_t\n"
+           "\t\tkernel_code_version_major = 1\n"
+           "\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n"
+           /* "\t\tmachine_version_major = 8\n"
+              "\t\tmachine_version_minor = 0\n"
+              "\t\tmachine_version_stepping = 1\n" */
+           "\t\tkernel_code_entry_byte_offset = 256\n"
+           "\t\tkernel_code_prefetch_byte_size = 0\n"
+           "\t\tmax_scratch_backing_memory_byte_size = 0\n"
+           "\t\tcompute_pgm_rsrc1_vgprs = %i\n"
+           "\t\tcompute_pgm_rsrc1_sgprs = %i\n"
+           "\t\tcompute_pgm_rsrc1_priority = 0\n"
+           "\t\tcompute_pgm_rsrc1_float_mode = 192\n"
+           "\t\tcompute_pgm_rsrc1_priv = 0\n"
+           "\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n"
+           "\t\tcompute_pgm_rsrc1_debug_mode = 0\n"
+           "\t\tcompute_pgm_rsrc1_ieee_mode = 1\n"
+           /* We enable scratch memory.  */
+           "\t\tcompute_pgm_rsrc2_scratch_en = 1\n"
+           "\t\tcompute_pgm_rsrc2_user_sgpr = %i\n"
+           "\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n"
+           "\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n"
+           "\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n"
+           "\t\tcompute_pgm_rsrc2_tg_size_en = 0\n"
+           "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n"
+           "\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n"
+           "\t\tcompute_pgm_rsrc2_lds_size = 0\n"  /* FIXME */
+           "\t\tcompute_pgm_rsrc2_excp_en = 0\n",
+           (vgpr - 1) / 4,
+           /* Must match wavefront_sgpr_count.  */
+           (sgpr + extra_regs - 1) / 8,
+           /* The total number of SGPR user data registers requested.  This
+              number must match the number of user data registers
+              enabled.  */
+           cfun->machine->args.nsgprs);
+
+  /* Walk the kernel argument types, handing out consecutive SGPRs to the
+     requested ones that have no fixed register, and print one header line
+     (or comment) per argument type.  */
+  int reg = FIRST_SGPR_REG;
+  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
+    {
+      int reg_first = -1;
+      int reg_last = -1;  /* Only meaningful when reg_first != -1.  */
+      if ((cfun->machine->args.requested & (1 << a))
+          && (gcn_kernel_arg_types[a].fixed_regno < 0))
+        {
+          reg_first = reg;
+          reg_last = (reg_first
+                      + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
+                         / UNITS_PER_WORD) - 1);
+          reg = reg_last + 1;
+        }
+
+      if (gcn_kernel_arg_types[a].header_pseudo)
+        {
+          fprintf (file, "\t\t%s = %i",
+                   gcn_kernel_arg_types[a].header_pseudo,
+                   (cfun->machine->args.requested & (1 << a)) != 0);
+          if (reg_first != -1)
+            {
+              /* Append the assigned registers as an assembler comment.  */
+              fprintf (file, " ; (");
+              for (int i = reg_first; i <= reg_last; ++i)
+                {
+                  if (i != reg_first)
+                    fprintf (file, ", ");
+                  fprintf (file, "%s", reg_names[i]);
+                }
+              fprintf (file, ")");
+            }
+          fprintf (file, "\n");
+        }
+      else if (gcn_kernel_arg_types[a].fixed_regno >= 0
+               && cfun->machine->args.requested & (1 << a))
+        fprintf (file, "\t\t; %s = %i (%s)\n",
+                 gcn_kernel_arg_types[a].name,
+                 (cfun->machine->args.requested & (1 << a)) != 0,
+                 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
+    }
+  fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n",
+           (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
+           ? 2
+           : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
+           ? 1 : 0);
+  fprintf (file, "\t\tenable_ordered_append_gds = 0\n"
+           "\t\tprivate_element_size = 1\n"
+           "\t\tis_ptr64 = 1\n"
+           "\t\tis_dynamic_callstack = 0\n"
+           "\t\tis_debug_enabled = 0\n"
+           "\t\tis_xnack_enabled = %i\n"
+           "\t\tworkitem_private_segment_byte_size = %i\n"
+           "\t\tworkgroup_group_segment_byte_size = %u\n"
+           "\t\tgds_segment_byte_size = 0\n"
+           "\t\tkernarg_segment_byte_size = %i\n"
+           "\t\tworkgroup_fbarrier_count = 0\n"
+           "\t\twavefront_sgpr_count = %i\n"
+           "\t\tworkitem_vgpr_count = %i\n"
+           "\t\treserved_vgpr_first = 0\n"
+           "\t\treserved_vgpr_count = 0\n"
+           "\t\treserved_sgpr_first = 0\n"
+           "\t\treserved_sgpr_count = 0\n"
+           "\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n"
+           "\t\tdebug_private_segment_buffer_sgpr = 0\n"
+           "\t\tkernarg_segment_alignment = %i\n"
+           "\t\tgroup_segment_alignment = 4\n"
+           "\t\tprivate_segment_alignment = %i\n"
+           "\t\twavefront_size = 6\n"
+           "\t\tcall_convention = 0\n"
+           "\t\truntime_loader_kernel_symbol = 0\n"
+           "\t.end_amd_kernel_code_t\n", xnack_enabled,
+           /* workitem_private_segment_bytes_size needs to be
+              one 64th the wave-front stack size.  */
+           stack_size_opt / 64,
+           LDS_SIZE, cfun->machine->kernarg_segment_byte_size,
+           /* Number of scalar registers used by a wavefront.  This
+              includes the special SGPRs for VCC, Flat Scratch (Base,
+              Size) and XNACK (for GFX8 (VI)+).  It does not include the
+              16 SGPR added if a trap handler is enabled.  Must match
+              compute_pgm_rsrc1.sgprs.  */
+           sgpr + extra_regs, vgpr,
+           cfun->machine->kernarg_segment_alignment,
+           crtl->stack_alignment_needed / 8);
+
+  /* This comment is read by mkoffload.  */
+  if (flag_openacc)
+    fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
+             oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
+             oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
+             oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
+}
+
+/* Implement TARGET_ASM_SELECT_SECTION.
+
+   Return the section into which EXP should be placed.  Objects whose type
+   is in ADDR_SPACE_LDS go to ".lds_bss"; everything else is left to
+   default_elf_select_section, which also receives RELOC and ALIGN.  */
+
+static section *
+gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
+{
+  bool lds_p = (TREE_TYPE (exp) != error_mark_node
+                && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS);
+
+  if (!lds_p)
+    return default_elf_select_section (exp, reloc, align);
+
+  /* Declarations get a named section; anything else shares the plain
+     ".lds_bss" section.  */
+  if (DECL_P (exp))
+    return get_named_section (exp, ".lds_bss", reloc);
+
+  return get_section (".lds_bss",
+                      SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
+                      NULL);
+}
+
+/* Implement TARGET_ASM_FUNCTION_PROLOGUE.
+
+   Emits custom text into the assembler file at the head of each function:
+   comments describing the frame layout returned by
+   gcn_compute_frame_offsets and, for kernel entry points only, an
+   instruction that sets the FP denormal mode.  */
+
+static void
+gcn_target_asm_function_prologue (FILE *file)
+{
+  machine_function *frame = gcn_compute_frame_offsets ();
+  const char *addressing = frame->use_flat_addressing ? "flat" : "global";
+
+  asm_fprintf (file, "\t; using %s addressing in function\n", addressing);
+
+  if (!frame->normal_function)
+    {
+      asm_fprintf (file, "\t; HSA kernel entry point\n");
+      asm_fprintf (file, "\t; local vars size: %wd\n", frame->local_vars);
+      asm_fprintf (file, "\t; outgoing args size: %wd\n",
+                   frame->outgoing_args_size);
+
+      /* Enable denorms.  */
+      asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double"
+                   " input and output denorms\n");
+      asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n");
+      return;
+    }
+
+  const char *need_fp = frame->need_frame_pointer ? "true" : "false";
+  const char *save_lr = frame->lr_needs_saving ? "true" : "false";
+
+  asm_fprintf (file, "\t; frame pointer needed: %s\n", need_fp);
+  asm_fprintf (file, "\t; lr needs saving: %s\n", save_lr);
+  asm_fprintf (file, "\t; outgoing args size: %wd\n",
+               frame->outgoing_args_size);
+  asm_fprintf (file, "\t; pretend size: %wd\n", frame->pretend_size);
+  asm_fprintf (file, "\t; local vars size: %wd\n", frame->local_vars);
+  asm_fprintf (file, "\t; callee save size: %wd\n", frame->callee_saves);
+}
+
+/* Helper function for print_operand and print_operand_address.
+
+   Print register X to FILE as the assembler requires, according to mode
+   and name: a single register name for modes printed as one register, a
+   two-register range (or "flat_scratch", "exec", "vcc") for the 64-bit
+   element modes, and a four-register range for TImode.  Any other mode is
+   a bug.  */
+
+static void
+print_reg (FILE *file, rtx x)
+{
+  machine_mode mode = GET_MODE (x);
+
+  bool single_p = (mode == BImode || mode == QImode || mode == HImode
+                   || mode == SImode || mode == HFmode || mode == SFmode
+                   || mode == V64SFmode || mode == V64SImode
+                   || mode == V64QImode || mode == V64HImode);
+  bool pair_p = (mode == DImode || mode == V64DImode
+                 || mode == DFmode || mode == V64DFmode);
+  bool quad_p = (mode == TImode);
+
+  if (single_p)
+    {
+      fprintf (file, "%s", reg_names[REGNO (x)]);
+      return;
+    }
+
+  if (!pair_p && !quad_p)
+    gcc_unreachable ();
+
+  /* Distance from the first to the last register of the group.  */
+  int span = pair_p ? 1 : 3;
+
+  if (SGPR_REGNO_P (REGNO (x)))
+    {
+      fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
+               REGNO (x) - FIRST_SGPR_REG + span);
+      return;
+    }
+
+  if (VGPR_REGNO_P (REGNO (x)))
+    {
+      fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
+               REGNO (x) - FIRST_VGPR_REG + span);
+      return;
+    }
+
+  /* Four-register groups exist only in the SGPR and VGPR files.  */
+  if (quad_p)
+    gcc_unreachable ();
+
+  /* Remaining register pairs: a few have dedicated assembler names.  */
+  if (REGNO (x) == FLAT_SCRATCH_REG)
+    fprintf (file, "flat_scratch");
+  else if (REGNO (x) == EXEC_REG)
+    fprintf (file, "exec");
+  else if (REGNO (x) == VCC_LO_REG)
+    fprintf (file, "vcc");
+  else
+    fprintf (file, "[%s:%s]",
+             reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
+}
+
+/* Implement TARGET_SECTION_TYPE_FLAGS.
+
+   Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION.
+   The section named ".lds_bss" gets a fixed set of flags; every other
+   NAME is passed on to default_section_type_flags with DECL and RELOC.  */
+
+static unsigned int
+gcn_section_type_flags (tree decl, const char *name, int reloc)
+{
+  if (strcmp (name, ".lds_bss") != 0)
+    return default_section_type_flags (decl, name, reloc);
+
+  return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
+}
+
+/* Helper function for gcn_asm_output_symbol_ref.
+
+   Print to F the offset recorded for VAR in the current function's
+   lds_allocs map.  If VAR has no entry yet, assign it the next suitably
+   aligned offset, record that, and advance the running total by the size
+   of VAR's type; an error is reported once the total exceeds LDS_SIZE.
+
+   FIXME: If we want to have propagation blocks allocated separately and
+   statically like this, it would be better done via symbol refs and the
+   assembler/linker.  This is a temporary hack.  */
+
+static void
+gcn_print_lds_decl (FILE *f, tree var)
+{
+  machine_function *mf = cfun->machine;
+  int *known_offset = mf->lds_allocs->get (var);
+
+  if (known_offset)
+    {
+      fprintf (f, "%u", (unsigned) *known_offset);
+      return;
+    }
+
+  unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
+  unsigned HOST_WIDE_INT size
+    = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (var)));
+
+  /* Raise the alignment to 8 for objects larger than both 4 bytes and
+     their declared alignment.  */
+  if (size > align && size > 4 && align < 8)
+    align = 8;
+
+  /* Round the running total up to the alignment.  */
+  mf->lds_allocated = ((mf->lds_allocated + align - 1) & ~(align - 1));
+
+  mf->lds_allocs->put (var, mf->lds_allocated);
+  fprintf (f, "%u", mf->lds_allocated);
+
+  mf->lds_allocated += size;
+  if (mf->lds_allocated > LDS_SIZE)
+    error ("local data-share memory exhausted");
+}
+
+/* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h.
+
+   Write the SYMBOL_REF X to FILE.  A reference to a VAR_DECL whose type
+   is in an LDS address space is printed by gcn_print_lds_decl; any other
+   symbol is printed by name.  */
+
+void
+gcn_asm_output_symbol_ref (FILE *file, rtx x)
+{
+  tree decl = SYMBOL_REF_DECL (x);
+  bool lds_var_p = (decl != 0
+                    && TREE_CODE (decl) == VAR_DECL
+                    && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))));
+
+  if (lds_var_p)
+    {
+      /* LDS symbols (emitted using this hook) are only used at present
+         to propagate worker values from an active thread to neutered
+         threads.  Use the same offset for each such block, but don't
+         use zero because null pointers are used to identify the active
+         thread in GOACC_single_copy_start calls.  */
+      gcn_print_lds_decl (file, decl);
+      return;
+    }
+
+  assemble_name (file, XSTR (x, 0));
+
+  /* FIXME: See above -- this condition is unreachable, because LDS
+     variables have already been handled.  */
+  if (lds_var_p)
+    fputs ("@abs32", file);
+}
+
+/* Implement TARGET_CONSTANT_ALIGNMENT.
+
+   Returns the alignment in bits of a constant that is being placed in memory.
+   CONSTANT is the constant (unused here) and BASIC_ALIGN is the alignment
+   that the object would ordinarily have.  The result is BASIC_ALIGN raised
+   to at least 128.  */
+
+static HOST_WIDE_INT
+gcn_constant_alignment (const_tree ARG_UNUSED (constant),
+                        HOST_WIDE_INT basic_align)
+{
+  if (basic_align > 128)
+    return basic_align;
+
+  return 128;
+}
+
+/* Implement TARGET_VECTOR_ALIGNMENT.
+
+   The alignment returned by this hook must be a power-of-two multiple of the
+   default alignment of the vector element type.  The value returned for
+   TYPE never exceeds 64.  */
+
+static HOST_WIDE_INT
+gcn_vector_alignment (const_tree type)
+{
+  /* V64BImode is a special case because it gets converted to DImode.  This
+     definition needs to not trip asserts within build_truth_vector_type.  */
+  if (TYPE_MODE (type) == V64BImode)
+    return 64;
+
+  HOST_WIDE_INT whole_size = tree_to_shwi (TYPE_SIZE (type));
+  HOST_WIDE_INT elem_size = tree_to_shwi (TYPE_SIZE (TREE_TYPE (type)));
+
+  /* Use the size (natural alignment) of the element type if we have a
+     64-element vector.  At present, smaller vectors will most likely use
+     scalar (load/store) instructions.  This definition will probably need
+     attention if support is added for fewer-element vectors in vector
+     regs.  */
+  HOST_WIDE_INT align = whole_size;
+  if (TYPE_VECTOR_SUBPARTS (type) == 64)
+    align = elem_size;
+
+  /* Cap the result at 64.  */
+  if (align > 64)
+    align = 64;
+
+  return align;
+}
+
+/* Implement PRINT_OPERAND_ADDRESS via gcn.h.
+
+   MEM must be a MEM rtx whose address is either a REG or a PLUS.  What is
+   written to FILE depends on the address space of MEM:
+
+   - scratch (AS_SCRATCH_P): the base register, followed by " offset:N"
+     when the address has a constant offset;
+   - flat (AS_ANY_FLAT_P): the base register only;
+   - global (AS_GLOBAL_P): the VGPR part of the address only;
+   - data share (AS_ANY_DS_P): the base register only;
+   - anything else: "BASE, OFFSET", where OFFSET is a register or an
+     integer constant ("0" when the address is a bare register).  */
+
+void
+print_operand_address (FILE *file, rtx mem)
+{
+  gcc_assert (MEM_P (mem));
+
+  rtx reg;
+  rtx offset;
+  addr_space_t as = MEM_ADDR_SPACE (mem);
+  rtx addr = XEXP (mem, 0);
+  gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
+
+  if (AS_SCRATCH_P (as))
+    switch (GET_CODE (addr))
+      {
+      case REG:
+        print_reg (file, addr);
+        break;
+
+      case PLUS:
+        reg = XEXP (addr, 0);
+        offset = XEXP (addr, 1);
+        print_reg (file, reg);
+        if (GET_CODE (offset) == CONST_INT)
+          fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
+        else
+          abort ();
+        break;
+
+      default:
+        debug_rtx (addr);
+        abort ();
+      }
+  else if (AS_ANY_FLAT_P (as))
+    {
+      if (GET_CODE (addr) == REG)
+        print_reg (file, addr);
+      else
+        {
+          gcc_assert (TARGET_GCN5_PLUS);
+          print_reg (file, XEXP (addr, 0));
+        }
+    }
+  else if (AS_GLOBAL_P (as))
+    {
+      gcc_assert (TARGET_GCN5_PLUS);
+
+      rtx base = addr;
+      rtx vgpr_offset = NULL_RTX;
+
+      if (GET_CODE (addr) == PLUS)
+        {
+          base = XEXP (addr, 0);
+
+          if (GET_CODE (base) == PLUS)
+            {
+              /* (SGPR + VGPR) + CONST */
+              vgpr_offset = XEXP (base, 1);
+              base = XEXP (base, 0);
+            }
+          else
+            {
+              offset = XEXP (addr, 1);
+
+              if (REG_P (offset))
+                /* SGPR + VGPR */
+                vgpr_offset = offset;
+              else if (CONST_INT_P (offset))
+                /* VGPR + CONST or SGPR + CONST */
+                ;
+              else
+                output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
+            }
+        }
+
+      if (REG_P (base))
+        {
+          if (VGPR_REGNO_P (REGNO (base)))
+            print_reg (file, base);
+          else if (SGPR_REGNO_P (REGNO (base)))
+            {
+              /* The assembler requires a 64-bit VGPR pair here, even though
+                 the offset should be only 32-bit.  */
+              if (vgpr_offset == NULL_RTX)
+                /* In this case, the vector offset is zero, so we use v0,
+                   which is initialized by the kernel prologue to zero.  */
+                fprintf (file, "v[0:1]");
+              else if (REG_P (vgpr_offset)
+                       && VGPR_REGNO_P (REGNO (vgpr_offset)))
+                {
+                  fprintf (file, "v[%d:%d]",
+                           REGNO (vgpr_offset) - FIRST_VGPR_REG,
+                           REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
+                }
+              else
+                output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
+            }
+        }
+      else
+        output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
+    }
+  else if (AS_ANY_DS_P (as))
+    switch (GET_CODE (addr))
+      {
+      case REG:
+        print_reg (file, addr);
+        break;
+
+      case PLUS:
+        /* Only the base is printed here; the offset is not part of this
+           operand.  */
+        reg = XEXP (addr, 0);
+        print_reg (file, reg);
+        break;
+
+      default:
+        debug_rtx (addr);
+        abort ();
+      }
+  else
+    switch (GET_CODE (addr))
+      {
+      case REG:
+        print_reg (file, addr);
+        fprintf (file, ", 0");
+        break;
+
+      case PLUS:
+        reg = XEXP (addr, 0);
+        offset = XEXP (addr, 1);
+        print_reg (file, reg);
+        fprintf (file, ", ");
+        if (GET_CODE (offset) == REG)
+          /* Print the offset register, not the base a second time.  */
+          print_reg (file, offset);
+        else if (GET_CODE (offset) == CONST_INT)
+          fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
+        else
+          abort ();
+        break;
+
+      default:
+        debug_rtx (addr);
+        abort ();
+      }
+}
+
+/* Implement PRINT_OPERAND via gcn.h.
+
+   Write operand X to FILE, formatted according to the operand code CODE:
+
+   b - print operand size as untyped operand (b8/b16/b32/b64)
+   B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
+   e - print the operand wrapped in "sext(...)"
+   i - print operand size as signed operand (i8/i16/i32/i64), or as a
+       floating-point operand (f16/f32/f64) for float modes
+   u - print operand size as unsigned operand (u8/u16/u32/u64), or as a
+       floating-point operand (f16/f32/f64) for float modes
+   o - print operand size as memory access size for loads
+       (ubyte/ushort/dword/dwordx2/dwordx3/dwordx4)
+   s - print operand size as memory access size for stores
+       (byte/short/dword/dwordx2/dwordx3/dwordx4)
+   C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
+   D - print conditional code for s_cmp (eq_u64/lg_u64...)
+   E - print conditional code for v_cmp (eq_u64/ne_u64...)
+   L - print part 0 of the operand, as given by gcn_operand_part
+   H - print part 1 of the operand, as given by gcn_operand_part
+   R - print a register number as an integer
+   V - print a register number, relative to FIRST_VGPR_REG, as an integer
+   A - print address in formatting suitable for given address space.
+   O - print offset:n for data share operations.
+   ^ - print "_co" suffix for GCN5 mnemonics
+   g - print "glc", if appropriate for given MEM
+
+   A CODE of zero prints the operand itself.  Unsupported codes and
+   operands are reported with output_operand_lossage.  */
+
+void
+print_operand (FILE *file, rtx x, int code)
+{
+  int xcode = x ? GET_CODE (x) : 0;
+  switch (code)
+    {
+      /* Instructions have the following suffixes.
+         If there are two suffixes, the first is the destination type,
+         and the second is the source type.
+
+         B32 Bitfield (untyped data) 32-bit
+         B64 Bitfield (untyped data) 64-bit
+         F16 floating-point 16-bit
+         F32 floating-point 32-bit (IEEE 754 single-precision float)
+         F64 floating-point 64-bit (IEEE 754 double-precision float)
+         I16 signed 16-bit integer
+         I32 signed 32-bit integer
+         I64 signed 64-bit integer
+         U16 unsigned 16-bit integer
+         U32 unsigned 32-bit integer
+         U64 unsigned 64-bit integer  */
+
+      /* Print operand size as untyped suffix.  */
+    case 'b':
+      {
+        const char *s = "";
+        machine_mode mode = GET_MODE (x);
+        if (VECTOR_MODE_P (mode))
+          mode = GET_MODE_INNER (mode);
+        switch (GET_MODE_SIZE (mode))
+          {
+          case 1:
+            s = "_b8";
+            break;
+          case 2:
+            s = "_b16";
+            break;
+          case 4:
+            s = "_b32";
+            break;
+          case 8:
+            s = "_b64";
+            break;
+          default:
+            output_operand_lossage ("invalid operand %%xn code");
+            return;
+          }
+        fputs (s, file);
+      }
+      return;
+    case 'B':
+      {
+        const char *s = "";
+        machine_mode mode = GET_MODE (x);
+        if (VECTOR_MODE_P (mode))
+          mode = GET_MODE_INNER (mode);
+        switch (GET_MODE_SIZE (mode))
+          {
+          case 1:
+          case 2:
+          case 4:
+            s = "_b32";
+            break;
+          case 8:
+            s = "_b64";
+            break;
+          default:
+            output_operand_lossage ("invalid operand %%xn code");
+            return;
+          }
+        fputs (s, file);
+      }
+      return;
+    case 'e':
+      fputs ("sext(", file);
+      print_operand (file, x, 0);
+      fputs (")", file);
+      return;
+    case 'i':
+    case 'u':
+      {
+        bool signed_p = code == 'i';
+        const char *s = "";
+        machine_mode mode = GET_MODE (x);
+        if (VECTOR_MODE_P (mode))
+          mode = GET_MODE_INNER (mode);
+        if (mode == VOIDmode)
+          /* Constants carry no mode; choose the suffix from the rtx
+             code.  */
+          switch (GET_CODE (x))
+            {
+            case CONST_INT:
+              s = signed_p ? "_i32" : "_u32";
+              break;
+            case CONST_DOUBLE:
+              s = "_f64";
+              break;
+            default:
+              output_operand_lossage ("invalid operand %%xn code");
+              return;
+            }
+        else if (FLOAT_MODE_P (mode))
+          switch (GET_MODE_SIZE (mode))
+            {
+            case 2:
+              s = "_f16";
+              break;
+            case 4:
+              s = "_f32";
+              break;
+            case 8:
+              s = "_f64";
+              break;
+            default:
+              output_operand_lossage ("invalid operand %%xn code");
+              return;
+            }
+        else
+          switch (GET_MODE_SIZE (mode))
+            {
+            case 1:
+              s = signed_p ? "_i8" : "_u8";
+              break;
+            case 2:
+              s = signed_p ? "_i16" : "_u16";
+              break;
+            case 4:
+              s = signed_p ? "_i32" : "_u32";
+              break;
+            case 8:
+              s = signed_p ? "_i64" : "_u64";
+              break;
+            default:
+              output_operand_lossage ("invalid operand %%xn code");
+              return;
+            }
+        fputs (s, file);
+      }
+      return;
+      /* Print operand size as memory access size for loads.  */
+    case 'o':
+      {
+        const char *s = 0;
+        switch (GET_MODE_SIZE (GET_MODE (x)))
+          {
+          case 1:
+            s = "_ubyte";
+            break;
+          case 2:
+            s = "_ushort";
+            break;
+            /* The following are full-vector variants.  */
+          case 64:
+            s = "_ubyte";
+            break;
+          case 128:
+            s = "_ushort";
+            break;
+          default:
+            break;
+          }
+
+        if (s)
+          {
+            fputs (s, file);
+            return;
+          }
+
+        /* The other cases for 'o' are the same as for 's'.  */
+      }
+      /* FALLTHRU */
+    case 's':
+      {
+        const char *s = "";
+        switch (GET_MODE_SIZE (GET_MODE (x)))
+          {
+          case 1:
+            s = "_byte";
+            break;
+          case 2:
+            s = "_short";
+            break;
+          case 4:
+            s = "_dword";
+            break;
+          case 8:
+            s = "_dwordx2";
+            break;
+          case 12:
+            s = "_dwordx3";
+            break;
+          case 16:
+            s = "_dwordx4";
+            break;
+          case 32:
+            s = "_dwordx8";
+            break;
+          case 64:
+            s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
+            break;
+            /* The following are full-vector variants.  */
+          case 128:
+            s = "_short";
+            break;
+          case 256:
+            s = "_dword";
+            break;
+          case 512:
+            s = "_dwordx2";
+            break;
+          default:
+            output_operand_lossage ("invalid operand %%xn code");
+            return;
+          }
+        fputs (s, file);
+      }
+      return;
+    case 'A':
+      if (xcode != MEM)
+        {
+          output_operand_lossage ("invalid %%xn code");
+          return;
+        }
+      print_operand_address (file, x);
+      return;
+    case 'O':
+      {
+        if (xcode != MEM)
+          {
+            output_operand_lossage ("invalid %%xn code");
+            return;
+          }
+        if (AS_GDS_P (MEM_ADDR_SPACE (x)))
+          fprintf (file, " gds");
+
+        rtx x0 = XEXP (x, 0);
+        if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
+          {
+            gcc_assert (TARGET_GCN5_PLUS);
+
+            fprintf (file, ", ");
+
+            rtx base = x0;
+            rtx const_offset = NULL_RTX;
+
+            if (GET_CODE (base) == PLUS)
+              {
+                rtx offset = XEXP (x0, 1);
+                base = XEXP (x0, 0);
+
+                if (GET_CODE (base) == PLUS)
+                  /* (SGPR + VGPR) + CONST */
+                  /* Ignore the VGPR offset for this operand.  */
+                  base = XEXP (base, 0);
+
+                if (CONST_INT_P (offset))
+                  const_offset = XEXP (x0, 1);
+                else if (REG_P (offset))
+                  /* SGPR + VGPR */
+                  /* Ignore the VGPR offset for this operand.  */
+                  ;
+                else
+                  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
+              }
+
+            if (REG_P (base))
+              {
+                if (VGPR_REGNO_P (REGNO (base)))
+                  /* The VGPR address is specified in the %A operand.  */
+                  fprintf (file, "off");
+                else if (SGPR_REGNO_P (REGNO (base)))
+                  print_reg (file, base);
+                else
+                  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
+              }
+            else
+              output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
+
+            if (const_offset != NULL_RTX)
+              fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
+                       INTVAL (const_offset));
+
+            return;
+          }
+
+        /* A bare register address has no offset to print.  */
+        if (GET_CODE (x0) == REG)
+          return;
+        if (GET_CODE (x0) != PLUS)
+          {
+            output_operand_lossage ("invalid %%xn code");
+            return;
+          }
+        rtx val = XEXP (x0, 1);
+        /* For a vector of offsets, print element 0.  */
+        if (GET_CODE (val) == CONST_VECTOR)
+          val = CONST_VECTOR_ELT (val, 0);
+        if (GET_CODE (val) != CONST_INT)
+          {
+            output_operand_lossage ("invalid %%xn code");
+            return;
+          }
+        fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
+
+      }
+      return;
+    case 'C':
+      {
+        const char *s;
+        bool num = false;
+        if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
+          {
+            output_operand_lossage ("invalid %%xn code");
+            return;
+          }
+        switch (REGNO (XEXP (x, 0)))
+          {
+          case VCCZ_REG:
+            s = "_vcc";
+            break;
+          case SCC_REG:
+            /* For some reason llvm-mc insists on scc0 instead of sccz.  */
+            num = true;
+            s = "_scc";
+            break;
+          case EXECZ_REG:
+            s = "_exec";
+            break;
+          default:
+            output_operand_lossage ("invalid %%xn code");
+            return;
+          }
+        fputs (s, file);
+        if (xcode == EQ)
+          fputc (num ? '0' : 'z', file);
+        else
+          fputs (num ? "1" : "nz", file);
+        return;
+      }
+    case 'D':
+      {
+        const char *s;
+        bool cmp_signed = false;
+        switch (xcode)
+          {
+          case EQ:
+            s = "_eq_";
+            break;
+          case NE:
+            s = "_lg_";
+            break;
+          case LT:
+            s = "_lt_";
+            cmp_signed = true;
+            break;
+          case LE:
+            s = "_le_";
+            cmp_signed = true;
+            break;
+          case GT:
+            s = "_gt_";
+            cmp_signed = true;
+            break;
+          case GE:
+            s = "_ge_";
+            cmp_signed = true;
+            break;
+          case LTU:
+            s = "_lt_";
+            break;
+          case LEU:
+            s = "_le_";
+            break;
+          case GTU:
+            s = "_gt_";
+            break;
+          case GEU:
+            s = "_ge_";
+            break;
+          default:
+            output_operand_lossage ("invalid %%xn code");
+            return;
+          }
+        fputs (s, file);
+        fputc (cmp_signed ? 'i' : 'u', file);
+
+        machine_mode mode = GET_MODE (XEXP (x, 0));
+
+        if (mode == VOIDmode)
+          mode = GET_MODE (XEXP (x, 1));
+
+        /* If both sides are constants, then assume the instruction is in
+           SImode since s_cmp can only do integer compares.  */
+        if (mode == VOIDmode)
+          mode = SImode;
+
+        switch (GET_MODE_SIZE (mode))
+          {
+          case 4:
+            s = "32";
+            break;
+          case 8:
+            s = "64";
+            break;
+          default:
+            output_operand_lossage ("invalid operand %%xn code");
+            return;
+          }
+        fputs (s, file);
+        return;
+      }
+    case 'E':
+      {
+        const char *s;
+        bool cmp_signed = false;
+        machine_mode mode = GET_MODE (XEXP (x, 0));
+
+        if (mode == VOIDmode)
+          mode = GET_MODE (XEXP (x, 1));
+
+        /* If both sides are constants, assume the instruction is in SFmode
+           if either operand is floating point, otherwise assume SImode.  */
+        if (mode == VOIDmode)
+          {
+            if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
+                || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
+              mode = SFmode;
+            else
+              mode = SImode;
+          }
+
+        /* Use the same format code for vector comparisons.  */
+        if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
+            || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+          mode = GET_MODE_INNER (mode);
+
+        bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
+
+        switch (xcode)
+          {
+          case EQ:
+            s = "_eq_";
+            break;
+          case NE:
+            s = float_p ? "_neq_" : "_ne_";
+            break;
+          case LT:
+            s = "_lt_";
+            cmp_signed = true;
+            break;
+          case LE:
+            s = "_le_";
+            cmp_signed = true;
+            break;
+          case GT:
+            s = "_gt_";
+            cmp_signed = true;
+            break;
+          case GE:
+            s = "_ge_";
+            cmp_signed = true;
+            break;
+          case LTU:
+            s = "_lt_";
+            break;
+          case LEU:
+            s = "_le_";
+            break;
+          case GTU:
+            s = "_gt_";
+            break;
+          case GEU:
+            s = "_ge_";
+            break;
+          case ORDERED:
+            s = "_o_";
+            break;
+          case UNORDERED:
+            s = "_u_";
+            break;
+          default:
+            output_operand_lossage ("invalid %%xn code");
+            return;
+          }
+        fputs (s, file);
+        fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
+
+        switch (GET_MODE_SIZE (mode))
+          {
+          case 1:
+            s = "32";
+            break;
+          case 2:
+            s = float_p ? "16" : "32";
+            break;
+          case 4:
+            s = "32";
+            break;
+          case 8:
+            s = "64";
+            break;
+          default:
+            output_operand_lossage ("invalid operand %%xn code");
+            return;
+          }
+        fputs (s, file);
+        return;
+      }
+    case 'L':
+      print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
+      return;
+    case 'H':
+      print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
+      return;
+    case 'R':
+      /* Print a scalar register number as an integer.  Temporary hack.  */
+      gcc_assert (REG_P (x));
+      fprintf (file, "%d", (int) REGNO (x));
+      return;
+    case 'V':
+      /* Print a vector register number as an integer.  Temporary hack.  */
+      gcc_assert (REG_P (x));
+      fprintf (file, "%d", (int) REGNO (x) - FIRST_VGPR_REG);
+      return;
+    case 0:
+      if (xcode == REG)
+        print_reg (file, x);
+      else if (xcode == MEM)
+        output_address (GET_MODE (x), x);
+      else if (xcode == CONST_INT)
+        fprintf (file, "%i", (int) INTVAL (x));
+      else if (xcode == CONST_VECTOR)
+        print_operand (file, CONST_VECTOR_ELT (x, 0), code);
+      else if (xcode == CONST_DOUBLE)
+        {
+          const char *str;
+          switch (gcn_inline_fp_constant_p (x, false))
+            {
+            case 240:
+              str = "0.5";
+              break;
+            case 241:
+              str = "-0.5";
+              break;
+            case 242:
+              str = "1.0";
+              break;
+            case 243:
+              str = "-1.0";
+              break;
+            case 244:
+              str = "2.0";
+              break;
+            case 245:
+              str = "-2.0";
+              break;
+            case 246:
+              str = "4.0";
+              break;
+            case 247:
+              str = "-4.0";
+              break;
+            case 248:
+              str = "1/pi";
+              break;
+            default:
+              {
+                /* Not an inline constant: print its bit pattern as an
+                   integer of the same width.  */
+                rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
+                                              ? DImode : SImode,
+                                              x, GET_MODE (x), 0);
+                /* Check the result of the conversion, not X, which is
+                   known to be non-null here.  */
+                if (ix)
+                  print_operand (file, ix, code);
+                else
+                  output_operand_lossage ("invalid fp constant");
+                return;
+              }
+            }
+          /* STR is not a format string; write it verbatim.  */
+          fputs (str, file);
+          return;
+        }
+      else
+        output_addr_const (file, x);
+      return;
+    case '^':
+      if (TARGET_GCN5_PLUS)
+        fputs ("_co", file);
+      return;
+    case 'g':
+      gcc_assert (xcode == MEM);
+      if (MEM_VOLATILE_P (x))
+        fputs (" glc", file);
+      return;
+    default:
+      output_operand_lossage ("invalid %%xn code");
+      /* The problem has been reported; do not run into the
+         gcc_unreachable below.  */
+      return;
+    }
+  gcc_unreachable ();
+}
+
+/* Return a hash value calculated from NAME.  Used by
+   ASM_FORMAT_PRIVATE_NAME.
+
+   The hash is a base-223 polynomial over the characters of NAME, with
+   unsigned wrap-around.  A null NAME hashes to 0, as does an empty
+   string.  */
+
+unsigned int
+gcn_local_sym_hash (const char *name)
+{
+  if (!name)
+    return 0;
+
+  unsigned int hash = 0;
+  for (const char *p = name; *p; p++)
+    hash = hash * 223 + *p;
+
+  return hash;
+}
+
+/* Return the assembler name to use for DECL, whose default name is ID.
+
+   Static, non-public variables get a '.' and the 8-digit hexadecimal
+   gcn_local_sym_hash of local_symbol_id appended to their name, provided
+   local_symbol_id is a non-empty string.  Everything else is passed on to
+   default_mangle_decl_assembler_name.  */
+
+static tree
+gcn_mangle_decl_assembler_name (tree decl, tree id)
+{
+  bool private_static_var_p = (TREE_CODE (decl) == VAR_DECL
+                               && TREE_STATIC (decl)
+                               && !TREE_PUBLIC (decl));
+
+  if (!private_static_var_p || !local_symbol_id || !*local_symbol_id)
+    return default_mangle_decl_assembler_name (decl, id);
+
+  const char *base = IDENTIFIER_POINTER (id);
+  /* Room for the name, the '.', eight hex digits and the terminator.  */
+  char *mangled = (char *) alloca (strlen (base) + 16);
+
+  sprintf (mangled, "%s.%.8x", base, gcn_local_sym_hash (local_symbol_id));
+
+  return get_identifier (mangled);
+}
+
+/* }}} */
+/* {{{ TARGET hook overrides. */
+/* Each #undef/#define pair below replaces the default for one target hook.  */
+#undef TARGET_ADDR_SPACE_ADDRESS_MODE
+#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
+#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
+#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
+ gcn_addr_space_legitimate_address_p
+#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
+#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
+#undef TARGET_ADDR_SPACE_POINTER_MODE
+#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
+#undef TARGET_ADDR_SPACE_SUBSET_P
+#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
+#undef TARGET_ADDR_SPACE_CONVERT
+#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
+#undef TARGET_ARG_PARTIAL_BYTES
+#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
+#undef TARGET_ASM_ALIGNED_DI_OP
+#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
+#undef TARGET_ASM_CONSTRUCTOR
+#define TARGET_ASM_CONSTRUCTOR gcn_disable_constructors
+#undef TARGET_ASM_DESTRUCTOR
+#define TARGET_ASM_DESTRUCTOR gcn_disable_constructors /* Same handler as the constructor hook. */
+#undef TARGET_ASM_FILE_START
+#define TARGET_ASM_FILE_START output_file_start
+#undef TARGET_ASM_FUNCTION_PROLOGUE
+#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
+#undef TARGET_ASM_SELECT_SECTION
+#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
+#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
+#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
+#undef TARGET_BUILTIN_DECL
+#define TARGET_BUILTIN_DECL gcn_builtin_decl
+#undef TARGET_CAN_CHANGE_MODE_CLASS
+#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
+#undef TARGET_CAN_ELIMINATE
+#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
+#undef TARGET_CANNOT_COPY_INSN_P
+#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
+#undef TARGET_CLASS_LIKELY_SPILLED_P
+#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
+#undef TARGET_CLASS_MAX_NREGS
+#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
+#undef TARGET_CONDITIONAL_REGISTER_USAGE
+#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
+#undef TARGET_CONSTANT_ALIGNMENT
+#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
+#undef TARGET_DEBUG_UNWIND_INFO
+#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
+#undef TARGET_EXPAND_BUILTIN
+#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
+#undef TARGET_FUNCTION_ARG
+#undef TARGET_FUNCTION_ARG_ADVANCE
+#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
+#define TARGET_FUNCTION_ARG gcn_function_arg
+#undef TARGET_FUNCTION_VALUE
+#define TARGET_FUNCTION_VALUE gcn_function_value
+#undef TARGET_FUNCTION_VALUE_REGNO_P
+#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
+#undef TARGET_GIMPLIFY_VA_ARG_EXPR
+#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
+#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD
+#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
+ gcn_goacc_adjust_propagation_record
+#undef TARGET_GOACC_ADJUST_GANGPRIVATE_DECL
+#define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl
+#undef TARGET_GOACC_FORK_JOIN
+#define TARGET_GOACC_FORK_JOIN gcn_fork_join
+#undef TARGET_GOACC_REDUCTION
+#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
+#undef TARGET_GOACC_VALIDATE_DIMS
+#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
+#undef TARGET_GOACC_WORKER_PARTITIONING
+#define TARGET_GOACC_WORKER_PARTITIONING true
+#undef TARGET_HARD_REGNO_MODE_OK
+#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
+#undef TARGET_HARD_REGNO_NREGS
+#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
+#undef TARGET_INIT_BUILTINS
+#define TARGET_INIT_BUILTINS gcn_init_builtins
+#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
+#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
+ gcn_ira_change_pseudo_allocno_class
+#undef TARGET_LEGITIMATE_COMBINED_INSN
+#define TARGET_LEGITIMATE_COMBINED_INSN gcn_legitimate_combined_insn
+#undef TARGET_LEGITIMATE_CONSTANT_P
+#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
+#undef TARGET_LRA_P
+#define TARGET_LRA_P hook_bool_void_true
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
+#undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
+#define TARGET_MANGLE_DECL_ASSEMBLER_NAME gcn_mangle_decl_assembler_name
+#undef TARGET_MEMORY_MOVE_COST
+#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
+#undef TARGET_MODES_TIEABLE_P
+#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
+#undef TARGET_OPTION_OVERRIDE
+#define TARGET_OPTION_OVERRIDE gcn_option_override
+#undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
+#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
+ gcn_pretend_outgoing_varargs_named
+#undef TARGET_PROMOTE_FUNCTION_MODE
+#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
+#undef TARGET_REGISTER_MOVE_COST
+#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
+#undef TARGET_RETURN_IN_MEMORY
+#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
+#undef TARGET_RTX_COSTS
+#define TARGET_RTX_COSTS gcn_rtx_costs
+#undef TARGET_SECONDARY_RELOAD
+#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
+#undef TARGET_SECTION_TYPE_FLAGS
+#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
+#undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
+#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
+ gcn_small_register_classes_for_mode_p
+#undef TARGET_SPILL_CLASS
+#define TARGET_SPILL_CLASS gcn_spill_class
+#undef TARGET_STRICT_ARGUMENT_NAMING
+#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
+#undef TARGET_TRAMPOLINE_INIT
+#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
+#undef TARGET_TRULY_NOOP_TRUNCATION
+#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
+#undef TARGET_VECTOR_ALIGNMENT
+#define TARGET_VECTOR_ALIGNMENT gcn_vector_alignment
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
+#undef TARGET_VECTORIZE_GET_MASK_MODE
+#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
+#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
+#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
+#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
+ gcn_vectorize_support_vector_misalignment
+#undef TARGET_VECTORIZE_VEC_PERM_CONST
+#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
+#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
+#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
+ gcn_vector_alignment_reachable
+#undef TARGET_VECTOR_MODE_SUPPORTED_P
+#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
+/* The target hook vector; TARGET_INITIALIZER is expected to pick up the overrides above. */
+struct gcc_target targetm = TARGET_INITIALIZER;
+
+#include "gt-gcn.h"
+/* }}} */
new file mode 100644
@@ -0,0 +1,670 @@
+/* Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+ This file is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your option)
+ any later version.
+
+ This file is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#include "config/gcn/gcn-opts.h"
+/* Predefine the preprocessor macro __AMDGCN__. */
+#define TARGET_CPU_CPP_BUILTINS() \
+ do \
+ { \
+ builtin_define ("__AMDGCN__"); \
+ } \
+ while(0)
+
+/* Support for a compile-time default architecture and tuning.
+ The rules are:
+ --with-arch is ignored if -march is specified.
+ --with-tune is ignored if -mtune is specified. */
+#define OPTION_DEFAULT_SPECS \
+ {"arch", "%{!march=*:-march=%(VALUE)}" }, \
+ {"tune", "%{!mtune=*:-mtune=%(VALUE)}" }
+
+/* Default target_flags if no switches specified. */
+#ifndef TARGET_DEFAULT
+#define TARGET_DEFAULT 0
+#endif
+
+
+/* Storage Layout: little-endian bits, bytes and words; 32-bit words, 64-bit pointers. */
+#define BITS_BIG_ENDIAN 0
+#define BYTES_BIG_ENDIAN 0
+#define WORDS_BIG_ENDIAN 0
+
+#define BITS_PER_WORD 32
+#define UNITS_PER_WORD (BITS_PER_WORD/BITS_PER_UNIT)
+#define LIBGCC2_UNITS_PER_WORD 4
+
+#define POINTER_SIZE 64
+#define PARM_BOUNDARY 64
+#define STACK_BOUNDARY 64
+#define FUNCTION_BOUNDARY 32
+#define BIGGEST_ALIGNMENT 64
+#define EMPTY_FIELD_BOUNDARY 32
+#define MAX_FIXED_MODE_SIZE 64
+#define MAX_REGS_PER_ADDRESS 2
+#define STACK_SIZE_MODE DImode
+#define Pmode DImode
+#define CASE_VECTOR_MODE DImode
+#define FUNCTION_MODE QImode
+/* Raise ALIGN to at least 128 for data and 64 for locals and stack slots; TYPE and MODE are ignored. */
+#define DATA_ALIGNMENT(TYPE,ALIGN) ((ALIGN) > 128 ? (ALIGN) : 128)
+#define LOCAL_ALIGNMENT(TYPE,ALIGN) ((ALIGN) > 64 ? (ALIGN) : 64)
+#define STACK_SLOT_ALIGNMENT(TYPE,MODE,ALIGN) ((ALIGN) > 64 ? (ALIGN) : 64)
+#define STRICT_ALIGNMENT 1
+
+/* Type Layout: match what x86_64 does. */
+#define INT_TYPE_SIZE 32
+#define LONG_TYPE_SIZE 64
+#define LONG_LONG_TYPE_SIZE 64
+#define FLOAT_TYPE_SIZE 32
+#define DOUBLE_TYPE_SIZE 64
+#define LONG_DOUBLE_TYPE_SIZE 64 /* NOTE(review): same size as double; confirm against the x86_64 remark above. */
+#define DEFAULT_SIGNED_CHAR 1
+#define PCC_BITFIELD_TYPE_MATTERS 1
+
+/* Frame Layout: the return address arrives in LINK_REGNUM; RETURN_ADDR_RTX yields it only for COUNT == 0. */
+#define FRAME_GROWS_DOWNWARD 0
+#define ARGS_GROW_DOWNWARD 1
+#define STACK_POINTER_OFFSET 0
+#define FIRST_PARM_OFFSET(FNDECL) 0
+#define DYNAMIC_CHAIN_ADDRESS(FP) plus_constant (Pmode, (FP), -16)
+#define INCOMING_RETURN_ADDR_RTX gen_rtx_REG (Pmode, LINK_REGNUM)
+#define STACK_DYNAMIC_OFFSET(FNDECL) (-crtl->outgoing_args_size)
+#define ACCUMULATE_OUTGOING_ARGS 1
+#define RETURN_ADDR_RTX(COUNT,FRAMEADDR) \
+ ((COUNT) == 0 ? get_hard_reg_initial_val (Pmode, LINK_REGNUM) : NULL_RTX)
+
+/* Register Basics: hard register numbers; gcn.md repeats them in define_constants. */
+#define FIRST_SGPR_REG 0
+#define SGPR_REGNO(N) ((N)+FIRST_SGPR_REG)
+#define LAST_SGPR_REG 101
+
+#define FLAT_SCRATCH_REG 102
+#define FLAT_SCRATCH_LO_REG 102
+#define FLAT_SCRATCH_HI_REG 103
+#define XNACK_MASK_REG 104
+#define XNACK_MASK_LO_REG 104
+#define XNACK_MASK_HI_REG 105
+#define VCC_LO_REG 106
+#define VCC_HI_REG 107
+#define VCCZ_REG 108
+#define TBA_REG 109
+#define TBA_LO_REG 109
+#define TBA_HI_REG 110
+#define TMA_REG 111
+#define TMA_LO_REG 111
+#define TMA_HI_REG 112
+#define TTMP0_REG 113
+#define TTMP11_REG 124
+#define M0_REG 125
+#define EXEC_REG 126
+#define EXEC_LO_REG 126
+#define EXEC_HI_REG 127
+#define EXECZ_REG 128
+#define SCC_REG 129
+/* 130-159 are reserved to simplify masks (named res130-res159 in REGISTER_NAMES). */
+#define FIRST_VGPR_REG 160
+#define VGPR_REGNO(N) ((N)+FIRST_VGPR_REG)
+#define LAST_VGPR_REG 415
+
+/* Frame Registers, and other registers */
+
+#define HARD_FRAME_POINTER_REGNUM 14
+#define STACK_POINTER_REGNUM 16
+#define LINK_REGNUM 18
+#define EXEC_SAVE_REG 20
+#define CC_SAVE_REG 22
+#define RETURN_VALUE_REG 24 /* Must be divisible by 4. */
+#define STATIC_CHAIN_REGNUM 30
+#define WORK_ITEM_ID_Z_REG 162 /* FIRST_VGPR_REG + 2, i.e. "v2". */
+#define SOFT_ARG_REG 416 /* "?ap0" in REGISTER_NAMES. */
+#define FRAME_POINTER_REGNUM 418 /* "?fp0" in REGISTER_NAMES. */
+#define FIRST_PSEUDO_REGISTER 420
+
+#define FIRST_PARM_REG 24 /* Same number as RETURN_VALUE_REG. */
+#define NUM_PARM_REGS 6
+
+/* There is no arg pointer. Just choose a fixed register that does
+ not interfere with anything. */
+#define ARG_POINTER_REGNUM SOFT_ARG_REG
+
+#define HARD_FRAME_POINTER_IS_ARG_POINTER 0
+#define HARD_FRAME_POINTER_IS_FRAME_POINTER 0
+
+#define SGPR_OR_VGPR_REGNO_P(N) (SGPR_REGNO_P (N) || VGPR_REGNO_P (N)) /* s0-s101 or v0-v255. */
+#define SGPR_REGNO_P(N) ((N) <= LAST_SGPR_REG) /* No lower bound: assumes N is not negative. */
+#define VGPR_REGNO_P(N) ((N)>=FIRST_VGPR_REG && (N) <= LAST_VGPR_REG)
+#define SSRC_REGNO_P(N) ((N) <= SCC_REG && (N) != VCCZ_REG) /* Regs 0-129 except VCCZ. */
+#define SDST_REGNO_P(N) ((N) <= EXEC_HI_REG && (N) != VCCZ_REG) /* Regs 0-127 except VCCZ. */
+#define CC_REG_P(X) (REG_P (X) && CC_REGNO_P (REGNO (X)))
+#define CC_REGNO_P(X) ((X) == SCC_REG || (X) == VCC_REG) /* VCC_REG is not #defined in this header. */
+#define FUNCTION_ARG_REGNO_P(N) \
+ ((N) >= FIRST_PARM_REG && (N) < (FIRST_PARM_REG + NUM_PARM_REGS))
+
+
+#define FIXED_REGISTERS { \
+ /* Scalars: s0-s101, ten per row (twelve in the last row). */ \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+/* s10-s19: fp (s14-15) and sp (s16-17) are fixed, lr (s18-19) is not. */ \
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, \
+/* exec_save (s20-21), cc_save (s22-23) */ \
+ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, \
+ /* Special regs and padding (102-159). */ \
+/* flat(2) xnack(2) vcc(2) vccz tba(2) tma(2) ttmp0-4 */ \
+ 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+/* ttmp5-11 m0 exec(2) execz scc, then reserved 130-159 */ \
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ /* VGPRs: v0-v255, sixteen per row. */ \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ /* Other registers: 416-419, the soft arg and frame pointers. */ \
+ 1, 1, 1, 1 \
+}
+
+#define CALL_USED_REGISTERS { \
+ /* Scalars: s0-s31 and s96-s101 are marked call-used. */ \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, \
+ /* Special regs and padding (102-159): all call-used. */ \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ /* VGPRs: only the first row, v0-v15, is call-used. */ \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ /* Other registers: 416-419, the soft arg and frame pointers. */ \
+ 1, 1, 1, 1 \
+}
+
+
+/* This returns true if the register has a special purpose on the
+ architecture, but is not fixed: SCC, VCC_LO/HI or EXEC_LO/HI. */
+#define SPECIAL_REGNO_P(REGNO) \
+ ((REGNO) == SCC_REG || (REGNO) == VCC_LO_REG || (REGNO) == VCC_HI_REG \
+ || (REGNO) == EXEC_LO_REG || (REGNO) == EXEC_HI_REG)
+/* The next two macros forward to functions defined outside this header. */
+#define HARD_REGNO_RENAME_OK(FROM, TO) \
+ gcn_hard_regno_rename_ok (FROM, TO)
+
+#define HARD_REGNO_CALLER_SAVE_MODE(HARDREG, NREGS, MODE) \
+ gcn_hard_regno_caller_save_mode ((HARDREG), (NREGS), (MODE))
+
+/* Register Classes */
+/* Enumerator order must match REG_CLASS_NAMES and REG_CLASS_CONTENTS. */
+enum reg_class
+{
+ NO_REGS,
+
+ /* SCC */
+ SCC_CONDITIONAL_REG,
+
+ /* VCCZ */
+ VCCZ_CONDITIONAL_REG,
+
+ /* VCC */
+ VCC_CONDITIONAL_REG,
+
+ /* EXECZ */
+ EXECZ_CONDITIONAL_REG,
+
+ /* SCC VCCZ EXECZ */
+ ALL_CONDITIONAL_REGS,
+
+ /* EXEC */
+ EXEC_MASK_REG,
+
+ /* SGPR0-101 */
+ SGPR_REGS,
+
+ /* SGPR0-101 EXEC_LO/EXEC_HI */
+ SGPR_EXEC_REGS,
+
+ /* SGPR0-101, VCC LO/HI, TBA LO/HI, TMA LO/HI, TTMP0-11, M0, EXEC LO/HI,
+ VCCZ, EXECZ, SCC
+ FIXME: Maybe manual has bug and FLAT_SCRATCH is OK. */
+ SGPR_VOP3A_SRC_REGS,
+
+ /* SGPR0-101, FLAT_SCRATCH_LO/HI, XNACK_MASK_LO/HI, VCC LO/HI, TBA LO/HI
+ TMA LO/HI, TTMP0-11 */
+ SGPR_MEM_SRC_REGS,
+
+ /* SGPR0-101, FLAT_SCRATCH_LO/HI, XNACK_MASK_LO/HI, VCC LO/HI, TBA LO/HI
+ TMA LO/HI, TTMP0-11, M0, EXEC LO/HI */
+ SGPR_DST_REGS,
+
+ /* Every register 0-127 (SGPR0-101 and all special registers up to
+ EXEC_HI, VCCZ included), plus EXECZ and SCC */
+ SGPR_SRC_REGS,
+ GENERAL_REGS, /* Same mask as SGPR_REGS. */
+ VGPR_REGS, /* v0-v255 (regs 160-415). */
+ ALL_GPR_REGS, /* SGPR_REGS plus VGPR_REGS. */
+ SRCDST_REGS, /* SGPR_DST_REGS plus VGPR_REGS. */
+ AFP_REGS, /* Regs 416-419: the soft arg and frame pointers. */
+ ALL_REGS,
+ LIM_REG_CLASSES
+};
+
+#define N_REG_CLASSES (int) LIM_REG_CLASSES
+
+#define REG_CLASS_NAMES /* One string per enum reg_class value, in enum order. */ \
+{ "NO_REGS", \
+ "SCC_CONDITIONAL_REG", \
+ "VCCZ_CONDITIONAL_REG", \
+ "VCC_CONDITIONAL_REG", \
+ "EXECZ_CONDITIONAL_REG", \
+ "ALL_CONDITIONAL_REGS", \
+ "EXEC_MASK_REG", \
+ "SGPR_REGS", \
+ "SGPR_EXEC_REGS", \
+ "SGPR_VOP3A_SRC_REGS", \
+ "SGPR_MEM_SRC_REGS", \
+ "SGPR_DST_REGS", \
+ "SGPR_SRC_REGS", \
+ "GENERAL_REGS", \
+ "VGPR_REGS", \
+ "ALL_GPR_REGS", \
+ "SRCDST_REGS", \
+ "AFP_REGS", \
+ "ALL_REGS" \
+}
+
+#define NAMED_REG_MASK(N) (1U<<((N)-3*32)) /* Bit for reg N (96-127) in mask word 3; unsigned so bit 31 is valid. */
+#define NAMED_REG_MASK2(N) (1U<<((N)-4*32)) /* Bit for reg N (128-159) in mask word 4. */
+
+#define REG_CLASS_CONTENTS { /* One row per reg_class; 14 32-bit words, bit B of word W is hard reg 32*W+B. */ \
+ /* NO_REGS. */ \
+ {0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* SCC_CONDITIONAL_REG. */ \
+ {0, 0, 0, 0, \
+ NAMED_REG_MASK2 (SCC_REG), 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* VCCZ_CONDITIONAL_REG. */ \
+ {0, 0, 0, NAMED_REG_MASK (VCCZ_REG), \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* VCC_CONDITIONAL_REG. */ \
+ {0, 0, 0, NAMED_REG_MASK (VCC_LO_REG)|NAMED_REG_MASK (VCC_HI_REG), \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* EXECZ_CONDITIONAL_REG. */ \
+ {0, 0, 0, 0, \
+ NAMED_REG_MASK2 (EXECZ_REG), 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* ALL_CONDITIONAL_REGS. */ \
+ {0, 0, 0, NAMED_REG_MASK (VCCZ_REG), \
+ NAMED_REG_MASK2 (EXECZ_REG) | NAMED_REG_MASK2 (SCC_REG), 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* EXEC_MASK_REG. */ \
+ {0, 0, 0, NAMED_REG_MASK (EXEC_LO_REG) | NAMED_REG_MASK (EXEC_HI_REG), \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* SGPR_REGS. NOTE(review): 0xf1 sets bits 0 and 4-7 of word 3 (regs 96, 100-103); s96-s101 alone would be 0x3f -- confirm. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, 0xf1, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* SGPR_EXEC_REGS. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xf1 | NAMED_REG_MASK (EXEC_LO_REG) | NAMED_REG_MASK (EXEC_HI_REG), \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* SGPR_VOP3A_SRC_REGS. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff \
+ -NAMED_REG_MASK (FLAT_SCRATCH_LO_REG) \
+ -NAMED_REG_MASK (FLAT_SCRATCH_HI_REG) \
+ -NAMED_REG_MASK (XNACK_MASK_LO_REG) \
+ -NAMED_REG_MASK (XNACK_MASK_HI_REG), \
+ NAMED_REG_MASK2 (EXECZ_REG) | NAMED_REG_MASK2 (SCC_REG), 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* SGPR_MEM_SRC_REGS. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff-NAMED_REG_MASK (VCCZ_REG)-NAMED_REG_MASK (M0_REG) \
+ -NAMED_REG_MASK (EXEC_LO_REG)-NAMED_REG_MASK (EXEC_HI_REG), \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* SGPR_DST_REGS. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff-NAMED_REG_MASK (VCCZ_REG), \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* SGPR_SRC_REGS. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+ NAMED_REG_MASK2 (EXECZ_REG) | NAMED_REG_MASK2 (SCC_REG), 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* GENERAL_REGS: same mask as SGPR_REGS. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, 0xf1, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* VGPR_REGS: words 5-12 are regs 160-415. */ \
+ {0, 0, 0, 0, \
+ 0, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0}, \
+ /* ALL_GPR_REGS. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, 0xf1, \
+ 0, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0}, \
+ /* SRCDST_REGS. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff-NAMED_REG_MASK (VCCZ_REG), \
+ 0, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0}, \
+ /* AFP_REGS: regs 416-419. */ \
+ {0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0xf}, \
+ /* ALL_REGS. NOTE(review): word 13 is 0, so regs 416-419 (AFP_REGS) are not included -- confirm. */ \
+ {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0 }}
+
+#define REGNO_REG_CLASS(REGNO) gcn_regno_reg_class (REGNO)
+#define MODE_CODE_BASE_REG_CLASS(MODE, AS, OUTER, INDEX) \
+ gcn_mode_code_base_reg_class (MODE, AS, OUTER, INDEX)
+#define REGNO_MODE_CODE_OK_FOR_BASE_P(NUM, MODE, AS, OUTER, INDEX) \
+ gcn_regno_mode_code_ok_for_base_p (NUM, MODE, AS, OUTER, INDEX)
+#define INDEX_REG_CLASS VGPR_REGS /* Index registers come from the VGPR class. */
+#define REGNO_OK_FOR_INDEX_P(regno) regno_ok_for_index_p (regno)
+
+
+/* Address spaces. The quoted keywords are those registered by REGISTER_TARGET_PRAGMAS. */
+enum gcn_address_spaces
+{
+ ADDR_SPACE_DEFAULT = 0, /* Resolved to flat or global by DEFAULT_ADDR_SPACE. */
+ ADDR_SPACE_FLAT, /* "__flat" */
+ ADDR_SPACE_SCALAR_FLAT, /* "__scalar_flat" */
+ ADDR_SPACE_FLAT_SCRATCH, /* "__flat_scratch" */
+ ADDR_SPACE_LDS, /* "__lds" */
+ ADDR_SPACE_GDS, /* "__gds" */
+ ADDR_SPACE_SCRATCH, /* No keyword is registered for this one. */
+ ADDR_SPACE_GLOBAL /* "__global" */
+};
+#define REGISTER_TARGET_PRAGMAS() do { /* Register the address-space keywords; the caller supplies the ';'. */ \
+ c_register_addr_space ("__flat", ADDR_SPACE_FLAT); \
+ c_register_addr_space ("__flat_scratch", ADDR_SPACE_FLAT_SCRATCH); \
+ c_register_addr_space ("__scalar_flat", ADDR_SPACE_SCALAR_FLAT); \
+ c_register_addr_space ("__lds", ADDR_SPACE_LDS); \
+ c_register_addr_space ("__gds", ADDR_SPACE_GDS); \
+ c_register_addr_space ("__global", ADDR_SPACE_GLOBAL); \
+} while (0)
+
+#define STACK_ADDR_SPACE \
+ (TARGET_GCN5_PLUS ? ADDR_SPACE_GLOBAL : ADDR_SPACE_FLAT)
+#define DEFAULT_ADDR_SPACE /* Global only when cfun->machine exists with use_flat_addressing clear; flat otherwise. */ \
+ ((cfun && cfun->machine && !cfun->machine->use_flat_addressing) \
+ ? ADDR_SPACE_GLOBAL : ADDR_SPACE_FLAT)
+#define AS_SCALAR_FLAT_P(AS) ((AS) == ADDR_SPACE_SCALAR_FLAT)
+#define AS_FLAT_SCRATCH_P(AS) ((AS) == ADDR_SPACE_FLAT_SCRATCH)
+#define AS_FLAT_P(AS) ((AS) == ADDR_SPACE_FLAT /* Also true for the default space when it resolves to flat. */ \
+ || ((AS) == ADDR_SPACE_DEFAULT \
+ && DEFAULT_ADDR_SPACE == ADDR_SPACE_FLAT))
+#define AS_LDS_P(AS) ((AS) == ADDR_SPACE_LDS)
+#define AS_GDS_P(AS) ((AS) == ADDR_SPACE_GDS)
+#define AS_SCRATCH_P(AS) ((AS) == ADDR_SPACE_SCRATCH)
+#define AS_GLOBAL_P(AS) ((AS) == ADDR_SPACE_GLOBAL /* Also true for the default space when it resolves to global. */ \
+ || ((AS) == ADDR_SPACE_DEFAULT \
+ && DEFAULT_ADDR_SPACE == ADDR_SPACE_GLOBAL))
+#define AS_ANY_FLAT_P(AS) (AS_FLAT_SCRATCH_P (AS) || AS_FLAT_P (AS))
+#define AS_ANY_DS_P(AS) (AS_LDS_P (AS) || AS_GDS_P (AS))
+
+
+/* Instruction Output. REGISTER_NAMES lists one name per hard register number, 0-419; "resN" fills the reserved gap 130-159. */
+#define REGISTER_NAMES \
+ {"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", \
+ "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20", \
+ "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", "s30", \
+ "s31", "s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39", "s40", \
+ "s41", "s42", "s43", "s44", "s45", "s46", "s47", "s48", "s49", "s50", \
+ "s51", "s52", "s53", "s54", "s55", "s56", "s57", "s58", "s59", "s60", \
+ "s61", "s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69", "s70", \
+ "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", \
+ "s81", "s82", "s83", "s84", "s85", "s86", "s87", "s88", "s89", "s90", \
+ "s91", "s92", "s93", "s94", "s95", "s96", "s97", "s98", "s99", \
+ "s100", "s101", \
+ "flat_scratch_lo", "flat_scratch_hi", "xnack_mask_lo", "xnack_mask_hi", \
+ "vcc_lo", "vcc_hi", "vccz", "tba_lo", "tba_hi", "tma_lo", "tma_hi", \
+ "ttmp0", "ttmp1", "ttmp2", "ttmp3", "ttmp4", "ttmp5", "ttmp6", "ttmp7", \
+ "ttmp8", "ttmp9", "ttmp10", "ttmp11", "m0", "exec_lo", "exec_hi", \
+ "execz", "scc", \
+ "res130", "res131", "res132", "res133", "res134", "res135", "res136", \
+ "res137", "res138", "res139", "res140", "res141", "res142", "res143", \
+ "res144", "res145", "res146", "res147", "res148", "res149", "res150", \
+ "res151", "res152", "res153", "res154", "res155", "res156", "res157", \
+ "res158", "res159", \
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", \
+ "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", \
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", \
+ "v31", "v32", "v33", "v34", "v35", "v36", "v37", "v38", "v39", "v40", \
+ "v41", "v42", "v43", "v44", "v45", "v46", "v47", "v48", "v49", "v50", \
+ "v51", "v52", "v53", "v54", "v55", "v56", "v57", "v58", "v59", "v60", \
+ "v61", "v62", "v63", "v64", "v65", "v66", "v67", "v68", "v69", "v70", \
+ "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "v80", \
+ "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", \
+ "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", "v100", \
+ "v101", "v102", "v103", "v104", "v105", "v106", "v107", "v108", "v109", \
+ "v110", "v111", "v112", "v113", "v114", "v115", "v116", "v117", "v118", \
+ "v119", "v120", "v121", "v122", "v123", "v124", "v125", "v126", "v127", \
+ "v128", "v129", "v130", "v131", "v132", "v133", "v134", "v135", "v136", \
+ "v137", "v138", "v139", "v140", "v141", "v142", "v143", "v144", "v145", \
+ "v146", "v147", "v148", "v149", "v150", "v151", "v152", "v153", "v154", \
+ "v155", "v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163", \
+ "v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171", "v172", \
+ "v173", "v174", "v175", "v176", "v177", "v178", "v179", "v180", "v181", \
+ "v182", "v183", "v184", "v185", "v186", "v187", "v188", "v189", "v190", \
+ "v191", "v192", "v193", "v194", "v195", "v196", "v197", "v198", "v199", \
+ "v200", "v201", "v202", "v203", "v204", "v205", "v206", "v207", "v208", \
+ "v209", "v210", "v211", "v212", "v213", "v214", "v215", "v216", "v217", \
+ "v218", "v219", "v220", "v221", "v222", "v223", "v224", "v225", "v226", \
+ "v227", "v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235", \
+ "v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", "v244", \
+ "v245", "v246", "v247", "v248", "v249", "v250", "v251", "v252", "v253", \
+ "v254", "v255", \
+ "?ap0", "?ap1", "?fp0", "?fp1" }
+
+#define PRINT_OPERAND(FILE, X, CODE) print_operand(FILE, X, CODE)
+#define PRINT_OPERAND_ADDRESS(FILE, ADDR) print_operand_address (FILE, ADDR)
+#define PRINT_OPERAND_PUNCT_VALID_P(CODE) ((CODE) == '^') /* '^' is the only punctuation code accepted. */
+
+
+/* Register Arguments */
+
+#ifndef USED_FOR_TARGET
+
+#define GCN_KERNEL_ARG_TYPES 19
+struct GTY(()) gcn_kernel_args
+{
+ long requested; /* NOTE(review): presumably a bit set of requested argument types -- confirm in gcn.c. */
+ int reg[GCN_KERNEL_ARG_TYPES]; /* One entry per kernel argument type. */
+ int order[GCN_KERNEL_ARG_TYPES]; /* One entry per kernel argument type. */
+ int nargs, nsgprs;
+};
+
+typedef struct gcn_args
+{
+ /* True if this isn't a kernel (HSA runtime entrypoint). */
+ bool normal_function;
+ tree fntype;
+ struct gcn_kernel_args args;
+ int num;
+ int offset;
+ int alignment;
+} CUMULATIVE_ARGS;
+#endif
+/* The final argument passed on is the flag (N_NAMED_ARGS) != -1, not the count itself. */
+#define INIT_CUMULATIVE_ARGS(CUM,FNTYPE,LIBNAME,FNDECL,N_NAMED_ARGS) \
+ gcn_init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL), \
+ (N_NAMED_ARGS) != -1)
+
+
+#ifndef USED_FOR_TARGET
+
+#include "hash-table.h"
+#include "hash-map.h"
+#include "vec.h"
+/* Per-function back-end state; DEFAULT_ADDR_SPACE reads it through cfun->machine. */
+struct GTY(()) machine_function
+{
+ struct gcn_kernel_args args;
+ int kernarg_segment_alignment;
+ int kernarg_segment_byte_size;
+ /* Frame layout info for normal functions. */
+ bool normal_function;
+ bool need_frame_pointer;
+ bool lr_needs_saving;
+ HOST_WIDE_INT outgoing_args_size;
+ HOST_WIDE_INT pretend_size;
+ HOST_WIDE_INT local_vars;
+ HOST_WIDE_INT callee_saves;
+
+ unsigned lds_allocated; /* NOTE(review): presumably the LDS space handed out so far -- confirm. */
+ hash_map<tree, int> *lds_allocs; /* Maps a tree to an int; NOTE(review): presumably its LDS offset -- confirm. */
+
+ vec<tree, va_gc> *reduc_decls;
+
+ bool use_flat_addressing; /* When false, DEFAULT_ADDR_SPACE is ADDR_SPACE_GLOBAL. */
+};
+#endif
+
+
+/* Codes for all the GCN builtins. */
+/* Generated from gcn-builtins.def: DEF_BUILTIN gives one code, DEF_BUILTIN_BINOP_INT_FP gives two. */
+enum gcn_builtin_codes
+{
+#define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
+ GCN_BUILTIN_ ## fcode,
+#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
+ GCN_BUILTIN_ ## fcode ## _V64SI, \
+ GCN_BUILTIN_ ## fcode ## _V64SI_unspec,
+#include "gcn-builtins.def"
+#undef DEF_BUILTIN
+#undef DEF_BUILTIN_BINOP_INT_FP
+ GCN_BUILTIN_MAX /* Number of builtin codes; not a builtin itself. */
+};
+
+
+/* Misc */
+
+/* We can load/store 128-bit quantities, but having this larger than
+ MAX_FIXED_MODE_SIZE (which we want to be 64 bits) causes problems. */
+#define MOVE_MAX 8
+
+#define AVOID_CCMODE_COPIES 1
+#define SLOW_BYTE_ACCESS 0
+#define WORD_REGISTER_OPERATIONS 1
+
+/* Definitions for register eliminations.
+
+ This is an array of structures. Each structure initializes one pair
+ of eliminable registers. The "from" register number is given first,
+ followed by "to". Eliminations of the same "from" register are listed
+ in order of preference. */
+/* Here the stack pointer is preferred over the hard frame pointer for both soft pointers. */
+#define ELIMINABLE_REGS \
+{{ ARG_POINTER_REGNUM, STACK_POINTER_REGNUM }, \
+ { ARG_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM }, \
+ { FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM }, \
+ { FRAME_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM }}
+
+/* Define the offset between two registers, one to be eliminated, and the
+ other its replacement, at the start of a routine. */
+
+#define INITIAL_ELIMINATION_OFFSET(FROM, TO, OFFSET) \
+ ((OFFSET) = gcn_initial_elimination_offset ((FROM), (TO)))
+
+
+/* Define this macro if it is advisable to hold scalars in registers
+ in a wider mode than that declared by the program. In such cases,
+ the value is constrained to be within the bounds of the declared
+ type, but kept valid in the wider mode. The signedness of the
+ extension may differ from that of the type. */
+/* Here: MODE_INT modes narrower than a word become SImode unless TYPE is a vector type; UNSIGNEDP is untouched. */
+#define PROMOTE_MODE(MODE,UNSIGNEDP,TYPE) \
+ if (GET_MODE_CLASS (MODE) == MODE_INT \
+ && ((TYPE) == NULL || TREE_CODE (TYPE) != VECTOR_TYPE) \
+ && GET_MODE_SIZE (MODE) < UNITS_PER_WORD) \
+ { \
+ (MODE) = SImode; \
+ }
+
+/* This needs to match gcn_function_value. */
+#define LIBCALL_VALUE(MODE) gen_rtx_REG (MODE, SGPR_REGNO (RETURN_VALUE_REG))
+
+
+/* Costs. */
+
+/* Branches are to be discouraged when there's an alternative.
+ FIXME: This number is plucked from the air. */
+#define BRANCH_COST(SPEED_P, PREDICABLE_P) 10
+
+
+/* Profiling */
+#define FUNCTION_PROFILER(FILE, LABELNO)
+#define NO_PROFILE_COUNTERS 1
+#define PROFILE_BEFORE_PROLOGUE 0
+
+/* Trampolines */
+#define TRAMPOLINE_SIZE 36
+#define TRAMPOLINE_ALIGNMENT 64
+
+/* Disable the "current_vector_size" feature intended for
+ AVX<->SSE switching. */
+#define TARGET_DISABLE_CURRENT_VECTOR_SIZE
new file mode 100644
@@ -0,0 +1,2199 @@
+;; Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+;;- See file "rtl.def" for documentation on define_insn, match_*, et. al.
+
+(include "predicates.md")
+(include "constraints.md")
+
+;; {{{ Constants and enums
+
+; Named registers
+; Hard register numbers of the special-purpose registers. For a register
+; pair, the unsuffixed name has the same number as its _LO half and the _HI
+; half is the next number (e.g. VCC_REG == VCC_LO_REG == 106, VCC_HI_REG ==
+; 107). The SGPR and VGPR banks are given by their first and last numbers.
+(define_constants
+ [(FIRST_SGPR_REG 0)
+ (LAST_SGPR_REG 101)
+ (FLAT_SCRATCH_REG 102)
+ (FLAT_SCRATCH_LO_REG 102)
+ (FLAT_SCRATCH_HI_REG 103)
+ (XNACK_MASK_REG 104)
+ (XNACK_MASK_LO_REG 104)
+ (XNACK_MASK_HI_REG 105)
+ (VCC_REG 106)
+ (VCC_LO_REG 106)
+ (VCC_HI_REG 107)
+ (VCCZ_REG 108)
+ (TBA_REG 109)
+ (TBA_LO_REG 109)
+ (TBA_HI_REG 110)
+ (TMA_REG 111)
+ (TMA_LO_REG 111)
+ (TMA_HI_REG 112)
+ (TTMP0_REG 113)
+ (TTMP11_REG 124)
+ (M0_REG 125)
+ (EXEC_REG 126)
+ (EXEC_LO_REG 126)
+ (EXEC_HI_REG 127)
+ (EXECZ_REG 128)
+ (SCC_REG 129)
+ (FIRST_VGPR_REG 160)
+ (LAST_VGPR_REG 415)])
+
+; Registers with a fixed software role. SP_REGNUM and LR_REGNUM lie inside
+; the FIRST_SGPR_REG..LAST_SGPR_REG range; AP_REGNUM and FP_REGNUM are
+; numbered above LAST_VGPR_REG.
+; NOTE(review): presumably AP/FP are the soft argument and frame pointers
+; that get eliminated -- confirm against the register definitions in the
+; target header.
+(define_constants
+ [(SP_REGNUM 16)
+ (LR_REGNUM 18)
+ (AP_REGNUM 416)
+ (FP_REGNUM 418)])
+
+; Codes for unspec_volatile operations. Of these, UNSPECV_PROLOGUE_USE is
+; used by the "prologue_use" pattern and UNSPECV_ICACHE_INV by the
+; "clear_icache" pattern later in this file.
+(define_c_enum "unspecv" [
+ UNSPECV_PROLOGUE_USE
+ UNSPECV_KERNEL_RETURN
+ UNSPECV_BARRIER
+ UNSPECV_ATOMIC
+ UNSPECV_ICACHE_INV])
+
+; Codes for (non-volatile) unspec operations.
+; NOTE(review): the *_DPP_SHR names presumably pair with the vop_dpp
+; encoding described below -- confirm at the patterns that use them.
+(define_c_enum "unspec" [
+ UNSPEC_VECTOR
+ UNSPEC_BPERMUTE
+ UNSPEC_SGPRBASE
+ UNSPEC_MEMORY_BARRIER
+ UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
+ UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
+ UNSPEC_PLUS_DPP_SHR
+ UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
+ UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
+ UNSPEC_MOV_FROM_LANE63
+ UNSPEC_GATHER
+ UNSPEC_SCATTER])
+
+;; }}}
+;; {{{ Attributes
+
+; Instruction type (encoding) as described in the ISA specification.
+; The following table summarizes possible operands of individual instruction
+; types and corresponding constraints.
+;
+; sop2 - scalar, two inputs, one output
+; ssrc0/ssrc1: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
+; vccz,execz,scc,inline immediate,fp inline immediate
+; sdst: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
+;
+; Constraints "=SD, SD", "SSA,SSB","SSB,SSA"
+;
+; sopk - scalar, inline constant input, one output
+; simm16: 16bit inline constant
+; sdst: same as sop2/ssrc0
+;
+; Constraints "=SD", "J"
+;
+; sop1 - scalar, one input, one output
+; ssrc0: same as sop2/ssrc0. FIXME: manual omit VCCZ
+; sdst: same as sop2/sdst
+;
+; Constraints "=SD", "SSA"
+;
+; sopc - scalar, two inputs, one comparison
+; ssrc0: same as sop2/ssrc0.
+;
+; Constraints "SSI,SSA","SSA,SSI"
+;
+; sopp - scalar, one constant input, one special
+; simm16
+;
+; smem - scalar memory
+; sbase: aligned pair of sgprs. Specify {size[15:0], base[47:0]} in
+; dwords
+; sdata: sgpr0-102, flat_scratch, xnack, vcc, tba, tma
+; offset: sgpr or 20bit unsigned byte offset
+;
+; vop2 - vector, two inputs, one output
+; vsrc0: sgpr0-102,flat_scratch,xnack,vcc,tba,ttmp0-11,m0,exec,
+; inline constant -16 to 64, fp inline immediate, vccz, execz,
+; scc, lds, literal constant, vgpr0-255
+; vsrc1: vgpr0-255
+; vdst: vgpr0-255
+; Limitations: At most one SGPR, at most one constant
+; if constant is used, SGPR must be M0
+; Only SRC0 can be LDS_DIRECT
+;
+; constraints: "=v", "vBSS", "v"
+;
+; vop1 - vector, one input, one output
+; vsrc0: same as vop2/src0
+; vdst: vgpr0-255
+;
+; constraints: "=v", "vBSS"
+;
+; vopc - vector, two inputs, one comparison output;
+; vsrc0: same as vop2/src0
+; vsrc1: vgpr0-255
+; vdst:
+;
+; constraints: "vASS", "v"
+;
+; vop3a - vector, three inputs, one output
+; vdst: vgpr0-255, for v_cmp sgpr or vcc
+; abs,clamp
+; vsrc0: sgpr0-102,vcc,tba,ttmp0-11,m0,exec,
+; inline constant -16 to 64, fp inline immediate, vccz, execz,
+; scc, lds_direct
+; FIXME: really missing 1/pi? really 104 SGPRs
+;
+; vop3b - vector, three inputs, one vector output, one scalar output
+; vsrc0,vsrc1,vsrc2: same as vop3a vsrc0
+; vdst: vgpr0-255
+; sdst: sgpr0-103/vcc/tba/tma/ttmp0-11
+;
+; vop_sdwa - second dword for vop1/vop2/vopc for specifying sub-dword address
+; src0: vgpr0-255
+; dst_sel: BYTE_0-3, WORD_0-1, DWORD
+; dst_unused: UNUSED_PAD, UNUSED_SEXT, UNUSED_PRESERVE
+; clamp: true/false
+; src0_sel: BYTE_0-3, WORD_0-1, DWORD
+; flags: src0_sext, src0_neg, src0_abs, src1_sel, src1_sext, src1_neg,
+; src1_abs
+;
+; vop_dpp - second dword for vop1/vop2/vopc for specifying data-parallel ops
+; src0: vgpr0-255
+; dpp_ctrl: quad_perm, row_sl0-15, row_sr0-15, row_rr0-15, wf_sl1,
+; wf_rl1, wf_sr1, wf_rr1, row_mirror, row_half_mirror,
+; bcast15, bcast31
+; flags: src0_neg, src0_abs, src1_neg, src1_abs
+; bank_mask: 4-bit mask
+; row_mask: 4-bit mask
+;
+; ds - Local and global data share instructions.
+; offset0: 8-bit constant
+; offset1: 8-bit constant
+; flag: gds
+; addr: vgpr0-255
+; data0: vgpr0-255
+; data1: vgpr0-255
+; vdst: vgpr0-255
+;
+; mubuf - Untyped memory buffer operation. First word with LDS, second word
+; non-LDS.
+; offset: 12-bit constant
+; vaddr: vgpr0-255
+; vdata: vgpr0-255
+; srsrc: sgpr0-102
+; soffset: sgpr0-102
+; flags: offen, idxen, glc, lds, slc, tfe
+;
+; mtbuf - Typed memory buffer operation. Two words
+; offset: 12-bit constant
+; dfmt: 4-bit constant
+; nfmt: 3-bit constant
+; vaddr: vgpr0-255
+; vdata: vgpr0-255
+; srsrc: sgpr0-102
+; soffset: sgpr0-102
+; flags: offen, idxen, glc, lds, slc, tfe
+;
+; flat - flat or global memory operations
+; flags: glc, slc
+; addr: vgpr0-255
+; data: vgpr0-255
+; vdst: vgpr0-255
+;
+; mult - expands to multiple instructions (pseudo encoding)
+;
+; vmult - as mult, when a vector instruction is used.
+
+; The instruction encoding of an insn (or alternative), using the names
+; from the table above. Defaults to "unknown" when a pattern does not set
+; it.
+(define_attr "type"
+ "unknown,sop1,sop2,sopk,sopc,sopp,smem,ds,vop2,vop1,vopc,
+ vop3a,vop3b,vop_sdwa,vop_dpp,mubuf,mtbuf,flat,mult,vmult"
+ (const_string "unknown"))
+
+; Set if instruction is executed in scalar or vector unit
+; The value is derived from the "type" attribute. Types listed in neither
+; branch (unknown, mubuf, mtbuf) yield "unknown".
+; NOTE(review): the encoding table above gives mubuf/mtbuf VGPR address and
+; data operands -- confirm that leaving them as "unknown" is intended.
+
+(define_attr "unit" "unknown,scalar,vector"
+ (cond [(eq_attr "type" "sop1,sop2,sopk,sopc,sopp,smem,mult")
+ (const_string "scalar")
+ (eq_attr "type" "vop2,vop1,vopc,vop3a,vop3b,ds,
+ vop_sdwa,vop_dpp,flat,vmult")
+ (const_string "vector")]
+ (const_string "unknown")))
+
+; All vector instructions run as 64 threads as predicated by the EXEC
+; register. Scalar operations in vector register require a single lane
+; enabled, vector moves require a full set of lanes enabled, and most vector
+; operations handle the lane masking themselves.
+; The md_reorg pass is responsible for ensuring that EXEC is set appropriately
+; according to the following settings:
+; auto - instruction doesn't use EXEC, or handles it itself.
+; md_reorg will inspect def/use to determine what to do.
+; single - disable all but lane zero.
+; full - enable all lanes.
+
+; The default is "auto"; insn patterns override it per alternative with
+; set_attr.
+(define_attr "exec" "auto,single,full"
+ (const_string "auto"))
+
+; Infer the (worst-case) length from the instruction type by default. Many
+; types can have an optional immediate word following, which we include here.
+; "Multiple" types are counted as two 64-bit instructions. This is just a
+; default fallback: it can be overridden per-alternative in insn patterns for
+; greater accuracy.
+; Values are in bytes (two 64-bit instructions == 16). Types without an
+; entry below (unknown, mubuf, mtbuf) get the final default of 4.
+
+(define_attr "length" ""
+ (cond [(eq_attr "type" "sop1") (const_int 8)
+ (eq_attr "type" "sop2") (const_int 8)
+ (eq_attr "type" "sopk") (const_int 8)
+ (eq_attr "type" "sopc") (const_int 8)
+ (eq_attr "type" "sopp") (const_int 4)
+ (eq_attr "type" "smem") (const_int 8)
+ (eq_attr "type" "ds") (const_int 8)
+ (eq_attr "type" "vop1") (const_int 8)
+ (eq_attr "type" "vop2") (const_int 8)
+ (eq_attr "type" "vopc") (const_int 8)
+ (eq_attr "type" "vop3a") (const_int 8)
+ (eq_attr "type" "vop3b") (const_int 8)
+ (eq_attr "type" "vop_sdwa") (const_int 8)
+ (eq_attr "type" "vop_dpp") (const_int 8)
+ (eq_attr "type" "flat") (const_int 8)
+ (eq_attr "type" "mult") (const_int 16)
+ (eq_attr "type" "vmult") (const_int 16)]
+ (const_int 4)))
+
+; Disable alternatives that only apply to specific ISA variants.
+; An alternative is tagged "gcn3" (the default) or "gcn5"; the "enabled"
+; attribute turns this tag into an on/off decision.
+
+(define_attr "gcn_version" "gcn3,gcn5" (const_string "gcn3"))
+
+; Alternatives tagged "gcn3" are always enabled; alternatives tagged "gcn5"
+; are enabled only when TARGET_GCN5_PLUS is nonzero; anything else is
+; disabled.
+(define_attr "enabled" ""
+ (cond [(eq_attr "gcn_version" "gcn3") (const_int 1)
+ (and (eq_attr "gcn_version" "gcn5")
+ (ne (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
+ (const_int 1)]
+ (const_int 0)))
+
+; We need to be able to identify v_readlane and v_writelane with
+; SGPR lane selection in order to handle "Manually Inserted Wait States".
+; Defaults to "no"; patterns that need the handling set it to "yes".
+
+(define_attr "laneselect" "yes,no" (const_string "no"))
+
+;; }}}
+;; {{{ Iterators useful across the whole machine description
+
+; Each iterator instantiates a pattern once per listed mode.
+; SIDI: 32- and 64-bit integers.
+(define_mode_iterator SIDI [SI DI])
+; SFDF: single- and double-precision floats.
+(define_mode_iterator SFDF [SF DF])
+; SISF: the 32-bit integer and float modes.
+(define_mode_iterator SISF [SI SF])
+; QIHI: the 8- and 16-bit integer modes.
+(define_mode_iterator QIHI [QI HI])
+; DIDF: the 64-bit integer and float modes.
+(define_mode_iterator DIDF [DI DF])
+
+;; }}}
+;; {{{ Attributes.
+
+; Translate RTX code into GCN instruction mnemonics with and without
+; suffixes such as _b32, etc.
+; The %i, %u, %b and %B sequences are output-template escapes.
+; NOTE(review): presumably the back end's operand printer expands them into
+; the type/size suffix -- confirm in the target's print_operand code.
+
+(define_code_attr mnemonic
+ [(minus "sub%i")
+ (plus "add%i")
+ (ashift "lshl%b")
+ (lshiftrt "lshr%b")
+ (ashiftrt "ashr%i")
+ (and "and%B")
+ (ior "or%B")
+ (xor "xor%B")
+ (mult "mul%i")
+ (smin "min%i")
+ (smax "max%i")
+ (umin "min%u")
+ (umax "max%u")
+ (not "not%b")
+ (popcount "bcnt_u32%b")])
+
+; Mnemonic stem without any suffix escape, for the five codes listed.
+(define_code_attr bare_mnemonic
+ [(plus "add")
+ (minus "sub")
+ (and "and")
+ (ior "or")
+ (xor "xor")])
+
+; Mnemonics for "not" and "popcount"; popcount differs from the entry in
+; "mnemonic" (bcnt1_i32 here, bcnt_u32 there).
+; NOTE(review): the s_ prefix suggests these are the scalar-unit spellings
+; -- confirm at the patterns that use this attribute.
+(define_code_attr s_mnemonic
+ [(not "not%b")
+ (popcount "bcnt1_i32%b")])
+
+; The "rev" spellings of subtract and the three shifts.
+; NOTE(review): presumably these take their source operands in swapped
+; order -- confirm at the patterns that use this attribute.
+(define_code_attr revmnemonic
+ [(minus "subrev%i")
+ (ashift "lshlrev%b")
+ (lshiftrt "lshrrev%b")
+ (ashiftrt "ashrrev%i")])
+
+; Translate RTX code into corresponding expander name.
+; The result is the stem of the standard pattern name, e.g. the
+; "<expander>si3" expander below becomes "addsi3" and "subsi3".
+
+(define_code_attr expander
+ [(and "and")
+ (ior "ior")
+ (xor "xor")
+ (plus "add")
+ (minus "sub")
+ (ashift "ashl")
+ (lshiftrt "lshr")
+ (ashiftrt "ashr")
+ (mult "mul")
+ (smin "smin")
+ (smax "smax")
+ (umin "umin")
+ (umax "umax")
+ (not "one_cmpl")
+ (popcount "popcount")])
+
+;; }}}
+;; {{{ Miscellaneous instructions
+
+; No-operation: emits "s_nop 0x0" (sopp encoding).
+(define_insn "nop"
+ [(const_int 0)]
+ ""
+ "s_nop\t0x0"
+ [(set_attr "type" "sopp")])
+
+; Unconditional trap: emits "s_trap 1".
+; FIXME: What should the value of the immediate be? Zero is disallowed, so
+; pick 1 for now.
+(define_insn "trap"
+ [(trap_if (const_int 1) (const_int 0))]
+ ""
+ "s_trap\t1"
+ [(set_attr "type" "sopp")])
+
+;; }}}
+;; {{{ Moves
+
+;; All scalar modes we support moves in.
+;; Used by the "mov<mode>" expander and by the invalid-move splitter below.
+(define_mode_iterator MOV_MODE [BI QI HI SI DI TI SF DF])
+
+; This is the entry point for creating all kinds of scalar moves,
+; including reloads and symbols.
+; Three adjustments are made before falling through to the plain set:
+; a store to memory gets its source forced into a register; a move that
+; gcn_valid_move_p rejects is routed through a fresh pseudo (only before
+; register allocation); and a DImode SYMBOL_REF/LABEL_REF source is expanded
+; with the movdi_symbol pattern.
+
+(define_expand "mov<mode>"
+ [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
+ (match_operand:MOV_MODE 1 "general_operand"))]
+ ""
+ {
+ /* Stores always take their value from a register. */
+ if (MEM_P (operands[0]))
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+
+ if (!lra_in_progress && !reload_completed
+ && !gcn_valid_move_p (<MODE>mode, operands[0], operands[1]))
+ {
+ /* Something is probably trying to generate a move
+ which can only work indirectly.
+ E.g. Move from LDS memory to SGPR hardreg
+ or MEM:QI to SGPR. */
+ rtx tmpreg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mov<mode> (tmpreg, operands[1]));
+ emit_insn (gen_mov<mode> (operands[0], tmpreg));
+ DONE;
+ }
+
+ /* Symbol and label addresses need the multi-instruction sequence. */
+ if (<MODE>mode == DImode
+ && (GET_CODE (operands[1]) == SYMBOL_REF
+ || GET_CODE (operands[1]) == LABEL_REF))
+ {
+ emit_insn (gen_movdi_symbol (operands[0], operands[1]));
+ DONE;
+ }
+ })
+
+; Split invalid moves into two valid moves
+; Applies only before register allocation (it creates a new pseudo) and
+; only to moves that gcn_valid_move_p rejects. The value goes through the
+; new pseudo held in operand 2.
+
+(define_split
+ [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
+ (match_operand:MOV_MODE 1 "general_operand"))]
+ "!reload_completed && !lra_in_progress
+ && !gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 0) (match_dup 2))]
+ {
+ operands[2] = gen_reg_rtx(<MODE>mode);
+ })
+
+; We need BImode move so we can reload flags registers.
+; One alternative per source/destination combination; the C body picks the
+; assembler text by which_alternative. Where the source is SCC the
+; instruction is emitted as raw bytes (see the comment in the body).
+; The "length" entry for the v_readlane_b32 alternative is 8, matching its
+; vop3a type and the same instruction in the 32-bit move pattern.
+
+(define_insn "*movbi"
+ [(set (match_operand:BI 0 "nonimmediate_operand"
+ "=SD, v,Sg,cs,cV,cV,Sm,RS, v,RF, v,RM")
+ (match_operand:BI 1 "gcn_load_operand"
+ "SSA,vSSA, v,SS, v,SS,RS,Sm,RF, v,RM, v"))]
+ ""
+ {
+ /* SCC as an operand is currently not accepted by the LLVM assembler, so
+ we emit bytes directly as a workaround. */
+ switch (which_alternative) {
+ case 0:
+ if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
+ return "; s_mov_b32\t%0,%1 is not supported by the assembler.\;"
+ ".byte\t0xfd\;"
+ ".byte\t0x0\;"
+ ".byte\t0x80|%R0\;"
+ ".byte\t0xbe";
+ else
+ return "s_mov_b32\t%0, %1";
+ case 1:
+ if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
+ return "; v_mov_b32\t%0, %1\;"
+ ".byte\t0xfd\;"
+ ".byte\t0x2\;"
+ ".byte\t((%V0<<1)&0xff)\;"
+ ".byte\t0x7e|(%V0>>7)";
+ else
+ return "v_mov_b32\t%0, %1";
+ case 2:
+ return "v_readlane_b32\t%0, %1, 0";
+ case 3:
+ return "s_cmpk_lg_u32\t%1, 0";
+ case 4:
+ return "v_cmp_ne_u32\tvcc, 0, %1";
+ case 5:
+ /* Two instructions: set vcc_lo from the source, then clear vcc_hi. */
+ if (REGNO (operands[1]) == SCC_REG)
+ return "; s_mov_b32\t%0, %1 is not supported by the assembler.\;"
+ ".byte\t0xfd\;"
+ ".byte\t0x0\;"
+ ".byte\t0xea\;"
+ ".byte\t0xbe\;"
+ "s_mov_b32\tvcc_hi, 0";
+ else
+ return "s_mov_b32\tvcc_lo, %1\;"
+ "s_mov_b32\tvcc_hi, 0";
+ case 6:
+ return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
+ case 7:
+ return "s_store_dword\t%1, %A0\;s_waitcnt\tlgkmcnt(0)";
+ case 8:
+ return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
+ case 9:
+ return "flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\t0";
+ case 10:
+ return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
+ case 11:
+ return "global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)";
+ default:
+ gcc_unreachable ();
+ }
+ }
+ [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,flat,flat,
+ flat,flat")
+ (set_attr "exec" "*,single,*,*,single,*,*,*,single,single,single,single")
+ (set_attr "length" "4,4,8,4,4,8,12,12,12,12,12,12")])
+
+; 32bit move pattern
+; Instantiated for SImode and SFmode. The alternatives are positional: the
+; Nth constraint of each operand, the Nth template line and the Nth entry
+; of each attribute list belong together. Every memory access is followed
+; by an s_waitcnt in the same template.
+
+(define_insn "*mov<mode>_insn"
+ [(set (match_operand:SISF 0 "nonimmediate_operand"
+ "=SD,SD,SD,SD,RB,Sm,RS,v,Sg, v, v,RF,v,RLRG, v,SD, v,RM")
+ (match_operand:SISF 1 "gcn_load_operand"
+ "SSA, J, B,RB,Sm,RS,Sm,v, v,SS,RF, v,B, v,RLRG, Y,RM, v"))]
+ ""
+ "@
+ s_mov_b32\t%0, %1
+ s_movk_i32\t%0, %1
+ s_mov_b32\t%0, %1
+ s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
+ s_buffer_store%s1\t%1, s[0:3], %0\;s_waitcnt\tlgkmcnt(0)
+ s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ s_store_dword\t%1, %A0\;s_waitcnt\tlgkmcnt(0)
+ v_mov_b32\t%0, %1
+ v_readlane_b32\t%0, %1, 0
+ v_writelane_b32\t%0, %1, 0
+ flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
+ flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\t0
+ v_mov_b32\t%0, %1
+ ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ s_mov_b32\t%0, %1
+ global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat,
+ flat,vop1,ds,ds,sop1,flat,flat")
+ (set_attr "exec" "*,*,*,*,*,*,*,single,*,*,single,single,single,
+ single,single,*,single,single")
+ (set_attr "length" "4,4,8,12,12,12,12,4,8,8,12,12,8,12,12,8,12,12")])
+
+; 8/16bit move pattern
+; Instantiated for QImode and HImode, and only matched when
+; gcn_valid_move_p accepts the operands. The alternatives are positional:
+; the Nth constraint, template line and attribute entry belong together.
+; The v_readlane_b32 and v_writelane_b32 alternatives are type vop3a and so
+; have length 8, as in the 32-bit move pattern.
+
+(define_insn "*mov<mode>_insn"
+ [(set (match_operand:QIHI 0 "nonimmediate_operand"
+ "=SD,SD,SD,v,Sg, v, v,RF,v,RLRG, v, v,RM")
+ (match_operand:QIHI 1 "gcn_load_operand"
+ "SSA, J, B,v, v,SS,RF, v,B, v,RLRG,RM, v"))]
+ "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
+ "@
+ s_mov_b32\t%0, %1
+ s_movk_i32\t%0, %1
+ s_mov_b32\t%0, %1
+ v_mov_b32\t%0, %1
+ v_readlane_b32\t%0, %1, 0
+ v_writelane_b32\t%0, %1, 0
+ flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
+ flat_store%s0\t%A0, %1%O0%g0\;s_waitcnt\t0
+ v_mov_b32\t%0, %1
+ ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ global_store%s0\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type"
+ "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
+ (set_attr "exec" "*,*,*,single,*,*,single,single,single,single,
+ single,single,single")
+ (set_attr "length" "4,4,8,4,8,8,12,12,8,12,12,12,12")])
+
+; 64bit move pattern
+; Instantiated for DImode and DFmode; SYMBOL_REF sources are excluded (the
+; mov<mode> expander sends those to movdi_symbol). Alternatives whose
+; template is "#" have no single instruction and are split into two SImode
+; moves. The split happens after reload for register/constant moves that
+; are not gcn_sgpr_move_p, and at any time for a CONST_INT source that is
+; not gcn_constant64_p.
+
+(define_insn_and_split "*mov<mode>_insn"
+ [(set (match_operand:DIDF 0 "nonimmediate_operand"
+ "=SD,SD,SD,RS,Sm,v, v,Sg, v, v,RF,RLRG, v, v,RM")
+ (match_operand:DIDF 1 "general_operand"
+ "SSA, C,DB,Sm,RS,v,DB, v,SS,RF, v, v,RLRG,RM, v"))]
+ "GET_CODE(operands[1]) != SYMBOL_REF"
+ "@
+ s_mov_b64\t%0, %1
+ s_mov_b64\t%0, %1
+ #
+ s_store_dwordx2\t%1, %A0\;s_waitcnt\tlgkmcnt(0)
+ s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ #
+ #
+ #
+ #
+ flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+ flat_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\t0
+ ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ global_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)"
+ "(reload_completed && !MEM_P (operands[0]) && !MEM_P (operands[1])
+ && !gcn_sgpr_move_p (operands[0], operands[1]))
+ || (GET_CODE (operands[1]) == CONST_INT && !gcn_constant64_p (operands[1]))"
+ [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))]
+ {
+ rtx inlo = gen_lowpart (SImode, operands[1]);
+ rtx inhi = gen_highpart_mode (SImode, <MODE>mode, operands[1]);
+ rtx outlo = gen_lowpart (SImode, operands[0]);
+ rtx outhi = gen_highpart_mode (SImode, <MODE>mode, operands[0]);
+
+ /* Ensure that overlapping registers aren't corrupted: if writing the
+ low half first would clobber the source's high half, move the high
+ halves first. The source may be a constant and, before reload, the
+ parts may be subregs, so REGNO cannot be applied to them directly;
+ reg_overlap_mentioned_p copes with any rtx and reports no overlap
+ for constants. */
+ if (reg_overlap_mentioned_p (outlo, inhi))
+ {
+ operands[0] = outhi;
+ operands[1] = inhi;
+ operands[2] = outlo;
+ operands[3] = inlo;
+ }
+ else
+ {
+ operands[0] = outlo;
+ operands[1] = inlo;
+ operands[2] = outhi;
+ operands[3] = inhi;
+ }
+ }
+ [(set_attr "type" "sop1,sop1,mult,smem,smem,vmult,vmult,vmult,vmult,flat,
+ flat,ds,ds,flat,flat")
+ (set_attr "exec" "*,*,*,*,*,*,*,*,*,single,single,single,single,single,
+ single")
+ (set_attr "length" "4,8,*,12,12,*,*,*,*,12,12,12,12,12,12")])
+
+; 128-bit move.
+; Memory alternatives are single instructions. Register/constant to
+; register alternatives ("#") are split after reload into four moves of
+; the parts returned by gcn_operand_part, emitted in order part 0 to part 3.
+; NOTE(review): unlike the 64-bit pattern, the split performs no check for
+; source/destination register overlap -- confirm that the constraints rule
+; out partially overlapping operands.
+
+(define_insn_and_split "*movti_insn"
+ [(set (match_operand:TI 0 "nonimmediate_operand"
+ "=SD,RS,Sm,RF, v,v, v,SD,RM, v,RL, v")
+ (match_operand:TI 1 "general_operand"
+ "SSB,Sm,RS, v,RF,v,SS, v, v,RM, v,RL"))]
+ ""
+ "@
+ #
+ s_store_dwordx4\t%1, %A0\;s_waitcnt\tlgkmcnt(0)
+ s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ flat_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\t0
+ flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
+ #
+ #
+ #
+ global_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\tvmcnt(0)
+ global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)"
+ "reload_completed
+ && REG_P (operands[0])
+ && (REG_P (operands[1]) || GET_CODE (operands[1]) == CONST_INT)"
+ [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))
+ (set (match_dup 4) (match_dup 5))
+ (set (match_dup 6) (match_dup 7))]
+ {
+ /* Fill the operand array from the top down so that operands[0] and
+ operands[1] are still the original TImode values whenever they are
+ read. */
+ operands[6] = gcn_operand_part (TImode, operands[0], 3);
+ operands[7] = gcn_operand_part (TImode, operands[1], 3);
+ operands[4] = gcn_operand_part (TImode, operands[0], 2);
+ operands[5] = gcn_operand_part (TImode, operands[1], 2);
+ operands[2] = gcn_operand_part (TImode, operands[0], 1);
+ operands[3] = gcn_operand_part (TImode, operands[1], 1);
+ operands[0] = gcn_operand_part (TImode, operands[0], 0);
+ operands[1] = gcn_operand_part (TImode, operands[1], 0);
+ }
+ [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\
+ ds,ds")
+ (set_attr "exec" "*,*,*,single,single,*,*,*,single,single,single,single")
+ (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")])
+
+;; }}}
+;; {{{ Prologue/Epilogue
+
+; A volatile "use" of operand 0 that emits no assembler text and has zero
+; length.
+(define_insn "prologue_use"
+ [(unspec_volatile [(match_operand 0)] UNSPECV_PROLOGUE_USE)]
+ ""
+ ""
+ [(set_attr "length" "0")])
+
+; Function prologue: all the work is done by gcn_expand_prologue.
+(define_expand "prologue"
+ [(const_int 0)]
+ ""
+ {
+ gcn_expand_prologue ();
+ DONE;
+ })
+
+; Function epilogue: all the work is done by gcn_expand_epilogue.
+(define_expand "epilogue"
+ [(const_int 0)]
+ ""
+ {
+ gcn_expand_epilogue ();
+ DONE;
+ })
+
+;; }}}
+;; {{{ Control flow
+
+; This pattern must satisfy simplejump_p, which means it cannot be a parallel
+; that clobbers SCC. Thus, we must preserve SCC if we're generating a long
+; branch sequence.
+
+; Unconditional jump. The length attribute is 4 when the target is within
+; [-131072, 131072) of the insn and 32 otherwise; the output code keys off
+; that length to choose between s_branch and the long sequence.
+(define_insn "jump"
+ [(set (pc)
+ (label_ref (match_operand 0)))]
+ ""
+ {
+ if (get_attr_length (insn) == 4)
+ return "s_branch\t%0";
+ else
+ /* Long form: save SCC in s22 (emitted as a raw .long, see the
+ comment string), build the target address in s[20:21] from the
+ PC, restore SCC by comparing s22 with zero, then jump. */
+ /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG. */
+ return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
+ ".long\t0xbe9600fd\;"
+ "s_getpc_b64\ts[20:21]\;"
+ "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+ "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+ "s_cmpk_lg_u32\ts22, 0\;"
+ "s_setpc_b64\ts[20:21]";
+ }
+ [(set_attr "type" "sopp")
+ (set (attr "length")
+ (if_then_else (and (ge (minus (match_dup 0) (pc))
+ (const_int -131072))
+ (lt (minus (match_dup 0) (pc))
+ (const_int 131072)))
+ (const_int 4)
+ (const_int 32)))])
+
+; Jump to the address held in a DImode register (constraint "Sg"), using
+; s_setpc_b64.
+(define_insn "indirect_jump"
+ [(set (pc)
+ (match_operand:DI 0 "register_operand" "Sg"))]
+ ""
+ "s_setpc_b64\t%0"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "4")])
+
+; Conditional jump on a BImode condition register compared with zero.
+; Operand 1 is the comparison operator, operand 2 the condition register and
+; operand 0 the target label. The length attribute is 4 for targets within
+; [-131072, 131072) and 28 otherwise; the output code keys off that length.
+(define_insn "cjump"
+ [(set (pc)
+ (if_then_else
+ (match_operator:BI 1 "gcn_conditional_operator"
+ [(match_operand:BI 2 "gcn_conditional_register_operand" " ca")
+ (const_int 0)])
+ (label_ref (match_operand 0))
+ (pc)))
+ (clobber (match_scratch:BI 3 "=cs"))]
+ ""
+ {
+ if (get_attr_length (insn) == 4)
+ return "s_cbranch%C1\t%0";
+ else
+ {
+ /* Long form: branch on the reversed condition over an absolute
+ jump whose address is built in s[20:21] from the PC. */
+ operands[1] = gen_rtx_fmt_ee (reverse_condition
+ (GET_CODE (operands[1])),
+ BImode, operands[2], const0_rtx);
+ /* !!! This sequence clobbers EXEC_SAVE_REG and SCC. */
+ return "s_cbranch%C1\t.skip%=\;"
+ "s_getpc_b64\ts[20:21]\;"
+ "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+ "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+ "s_setpc_b64\ts[20:21]\n"
+ ".skip%=:";
+ }
+ }
+ [(set_attr "type" "sopp")
+ (set (attr "length")
+ (if_then_else (and (ge (minus (match_dup 0) (pc))
+ (const_int -131072))
+ (lt (minus (match_dup 0) (pc))
+ (const_int 131072)))
+ (const_int 4)
+ (const_int 28)))])
+
+; Returning from a normal function is different to returning from a
+; kernel function.
+
+; When cfun->machine->normal_function is set, return by jumping to the
+; address in s[18:19] (LR_REGNUM is 18); otherwise write back the data
+; cache and end the program.
+(define_insn "gcn_return"
+ [(return)]
+ ""
+ {
+ if (cfun && cfun->machine && cfun->machine->normal_function)
+ return "s_setpc_b64\ts[18:19]";
+ else
+ return "s_dcache_wb\;s_endpgm";
+ }
+ [(set_attr "type" "sop1")
+ (set_attr "length" "8")])
+
+; Call without a return value. The expander only supplies the RTL shape
+; (clobber of the link register pair plus a DImode scratch); the matching
+; insns are gcn_simple_call and gcn_indirect_call.
+(define_expand "call"
+ [(parallel [(call (match_operand 0 "")
+ (match_operand 1 ""))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 2))])]
+ ""
+ {})
+
+; Direct call. Alternative 0 ("Y") builds the PC-relative address of the
+; callee in the DImode scratch and calls through it; alternative 1 ("B")
+; passes the operand to s_swappc_b64 directly and needs no scratch. In
+; both cases the return address is written to s[18:19].
+(define_insn "gcn_simple_call"
+ [(call (mem (match_operand 0 "immediate_operand" "Y,B"))
+ (match_operand 1 "const_int_operand"))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 2 "=&Sg,X"))]
+ ""
+ "@
+ s_getpc_b64\t%2\;s_add_u32\t%L2, %L2, %0@rel32@lo+4\;s_addc_u32\t%H2, %H2, %0@rel32@hi+4\;s_swappc_b64\ts[18:19], %2
+ s_swappc_b64\ts[18:19], %0"
+ [(set_attr "type" "mult,sop1")
+ (set_attr "length" "24,4")])
+
+; Load the address of a symbol or label into a DImode register. The address
+; is formed PC-relative with s_getpc_b64 plus a 64-bit add, which is why
+; SCC is clobbered. For weak symbols the add targets the symbol's
+; @gotpcrel32 relocation and the address is then loaded from there.
+(define_insn "movdi_symbol"
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=Sg")
+ (match_operand:DI 1 "general_operand" "Y"))
+ (clobber (reg:BI SCC_REG))]
+ "GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF"
+ {
+ if (SYMBOL_REF_P (operands[1])
+ && SYMBOL_REF_WEAK (operands[1]))
+ return "s_getpc_b64\t%0\;"
+ "s_add_u32\t%L0, %L0, %1@gotpcrel32@lo+4\;"
+ "s_addc_u32\t%H0, %H0, %1@gotpcrel32@hi+4\;"
+ "s_load_dwordx2\t%0, %0\;"
+ "s_waitcnt\tlgkmcnt(0)";
+
+ return "s_getpc_b64\t%0\;"
+ "s_add_u32\t%L0, %L0, %1@rel32@lo+4\;"
+ "s_addc_u32\t%H0, %H0, %1@rel32@hi+4";
+ }
+ [(set_attr "type" "mult")
+ (set_attr "length" "32")])
+
+; Call through a DImode register; the return address is written to
+; s[18:19]. The scratch exists only to match the shape produced by the
+; "call" expander and is never allocated ("=X").
+(define_insn "gcn_indirect_call"
+ [(call (mem (match_operand:DI 0 "register_operand" "Sg"))
+ (match_operand 1 "" ""))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 2 "=X"))]
+ ""
+ "s_swappc_b64\ts[18:19], %0"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "4")])
+
+; Call with a return value in operand 0. The expander only supplies the RTL
+; shape; the matching insns are gcn_call_value and gcn_call_value_indirect.
+(define_expand "call_value"
+ [(parallel [(set (match_operand 0 "")
+ (call (match_operand 1 "")
+ (match_operand 2 "")))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 3))])]
+ ""
+ {})
+
+; Direct call with a return value; the two alternatives emit the same
+; sequences as gcn_simple_call.
+; NOTE(review): "type" and "length" are single values here (sop1, 24)
+; whereas gcn_simple_call uses "mult,sop1" and "24,4" for the same two
+; sequences. The values here are at least as large, so this is
+; conservative -- confirm whether the difference is intentional.
+(define_insn "gcn_call_value"
+ [(set (match_operand 0 "register_operand" "=Sg,Sg")
+ (call (mem (match_operand 1 "immediate_operand" "Y,B"))
+ (match_operand 2 "const_int_operand")))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 3 "=&Sg,X"))]
+ ""
+ "@
+ s_getpc_b64\t%3\;s_add_u32\t%L3, %L3, %1@rel32@lo+4\;s_addc_u32\t%H3, %H3, %1@rel32@hi+4\;s_swappc_b64\ts[18:19], %3
+ s_swappc_b64\ts[18:19], %1"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "24")])
+
+; Call through a DImode register with a return value in operand 0; the
+; return address is written to s[18:19]. The scratch is never allocated.
+(define_insn "gcn_call_value_indirect"
+ [(set (match_operand 0 "register_operand" "=Sg")
+ (call (mem (match_operand:DI 1 "register_operand" "Sg"))
+ (match_operand 2 "" "")))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 3 "=X"))]
+ ""
+ "s_swappc_b64\ts[18:19], %1"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "4")])
+
+; GCN does not have an instruction to clear only part of the instruction
+; cache, so the operands are ignored.
+
+; Invalidate the instruction cache with s_icache_inv. Both operands are
+; accepted without any predicate and do not appear in the output.
+(define_insn "clear_icache"
+ [(unspec_volatile
+ [(match_operand 0 "") (match_operand 1 "")]
+ UNSPECV_ICACHE_INV)]
+ ""
+ "s_icache_inv"
+ [(set_attr "type" "sopp")
+ (set_attr "length" "4")])
+
+;; }}}
+;; {{{ Conditionals
+
+; 32-bit compare, scalar unit only
+
+; Set a condition register (constraint "cs") from a 32-bit comparison.
+; Alternative 1 uses the s_cmpk form; the others use s_cmp.
+; NOTE(review): alternatives 2 and 3 emit s_cmp but are typed "sopk"; their
+; length is given explicitly, so the only effect would be through other
+; attributes derived from "type" -- confirm whether "sopc" was intended.
+(define_insn "cstoresi4"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand"
+ "=cs, cs, cs, cs")
+ (match_operator:BI 1 "gcn_compare_operator"
+ [(match_operand:SI 2 "gcn_alu_operand" "SSA,SSA,SSB, SS")
+ (match_operand:SI 3 "gcn_alu_operand" "SSA,SSL, SS,SSB")]))]
+ ""
+ "@
+ s_cmp%D1\t%2, %3
+ s_cmpk%D1\t%2, %3
+ s_cmp%D1\t%2, %3
+ s_cmp%D1\t%2, %3"
+ [(set_attr "type" "sopc,sopk,sopk,sopk")
+ (set_attr "length" "4,4,8,8")])
+
+; Compare-and-branch on 32-bit integers: compute the comparison into a new
+; BImode pseudo with cstoresi4, then branch to operand 3 if that pseudo is
+; nonzero.
+(define_expand "cbranchsi4"
+ [(match_operator 0 "gcn_compare_operator"
+ [(match_operand:SI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")])
+ (match_operand 3)]
+ ""
+ {
+ rtx cc = gen_reg_rtx (BImode);
+ emit_insn (gen_cstoresi4 (cc, operands[0], operands[1], operands[2]));
+ emit_jump_insn (gen_cjump (operands[3],
+ gen_rtx_NE (BImode, cc, const0_rtx), cc));
+ DONE;
+ })
+
+; 64-bit compare; either unit
+
+; Expander for 64-bit compares: adds a (use ...) of the value returned by
+; gcn_scalar_exec as operand 4, which the matching insns below constrain
+; with gcn_exec_operand.
+(define_expand "cstoredi4"
+ [(parallel [(set (match_operand:BI 0 "gcn_conditional_register_operand")
+ (match_operator:BI 1 "gcn_compare_operator"
+ [(match_operand:DI 2 "gcn_alu_operand")
+ (match_operand:DI 3 "gcn_alu_operand")]))
+ (use (match_dup 4))])]
+ ""
+ {
+ operands[4] = gcn_scalar_exec ();
+ })
+
+; 64-bit compare for operators accepted by gcn_compare_64bit_operator.
+; Alternative 0 is the scalar form (s_cmp, result constraint "cs");
+; alternative 1 is the vector form (v_cmp writing vcc, result constraint
+; "cV").
+(define_insn "cstoredi4_vec_and_scalar"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cs, cV")
+ (match_operator:BI 1 "gcn_compare_64bit_operator"
+ [(match_operand:DI 2 "gcn_alu_operand" "%SSA,vSSC")
+ (match_operand:DI 3 "gcn_alu_operand" " SSC, v")]))
+ (use (match_operand:DI 4 "gcn_exec_operand" " n, e"))]
+ ""
+ "@
+ s_cmp%D1\t%2, %3
+ v_cmp%E1\tvcc, %2, %3"
+ [(set_attr "type" "sopc,vopc")
+ (set_attr "length" "8")])
+
+; 64-bit compare, vector form only (v_cmp writing vcc), for any operator
+; accepted by gcn_compare_operator.
+(define_insn "cstoredi4_vector"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cV")
+ (match_operator:BI 1 "gcn_compare_operator"
+ [(match_operand:DI 2 "gcn_alu_operand" "vSSB")
+ (match_operand:DI 3 "gcn_alu_operand" " v")]))
+ (use (match_operand:DI 4 "gcn_exec_operand" " e"))]
+ ""
+ "v_cmp%E1\tvcc, %2, %3"
+ [(set_attr "type" "vopc")
+ (set_attr "length" "8")])
+
+; Compare-and-branch on 64-bit integers: compute the comparison into a new
+; BImode pseudo with cstoredi4, then branch to operand 3 if that pseudo is
+; nonzero.
+(define_expand "cbranchdi4"
+ [(match_operator 0 "gcn_compare_operator"
+ [(match_operand:DI 1 "gcn_alu_operand")
+ (match_operand:DI 2 "gcn_alu_operand")])
+ (match_operand 3)]
+ ""
+ {
+ rtx cc = gen_reg_rtx (BImode);
+ emit_insn (gen_cstoredi4 (cc, operands[0], operands[1], operands[2]));
+ emit_jump_insn (gen_cjump (operands[3],
+ gen_rtx_NE (BImode, cc, const0_rtx), cc));
+ DONE;
+ })
+
+; FP compare; vector unit only
+
+; Expander for SFmode/DFmode compares: adds a (use ...) of the value
+; returned by gcn_scalar_exec as operand 4.
+(define_expand "cstore<mode>4"
+ [(parallel [(set (match_operand:BI 0 "gcn_conditional_register_operand")
+ (match_operator:BI 1 "gcn_fp_compare_operator"
+ [(match_operand:SFDF 2 "gcn_alu_operand")
+ (match_operand:SFDF 3 "gcn_alu_operand")]))
+ (use (match_dup 4))])]
+ ""
+ {
+ operands[4] = gcn_scalar_exec ();
+ })
+
+; Floating-point compare (v_cmp writing vcc). Despite the name there is
+; only a vector alternative.
+(define_insn "cstore<mode>4_vec_and_scalar"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand" "=cV")
+ (match_operator:BI 1 "gcn_fp_compare_operator"
+ [(match_operand:SFDF 2 "gcn_alu_operand" "vB")
+ (match_operand:SFDF 3 "gcn_alu_operand" "v")]))
+ (use (match_operand:DI 4 "gcn_exec_operand" "e"))]
+ ""
+ "v_cmp%E1\tvcc, %2, %3"
+ [(set_attr "type" "vopc")
+ (set_attr "length" "8")])
+
+; Compare-and-branch on SFmode/DFmode values: compute the comparison into a
+; new BImode pseudo with cstore<mode>4, then branch to operand 3 if that
+; pseudo is nonzero.
+(define_expand "cbranch<mode>4"
+ [(match_operator 0 "gcn_fp_compare_operator"
+ [(match_operand:SFDF 1 "gcn_alu_operand")
+ (match_operand:SFDF 2 "gcn_alu_operand")])
+ (match_operand 3)]
+ ""
+ {
+ rtx cc = gen_reg_rtx (BImode);
+ emit_insn (gen_cstore<mode>4 (cc, operands[0], operands[1], operands[2]));
+ emit_jump_insn (gen_cjump (operands[3],
+ gen_rtx_NE (BImode, cc, const0_rtx), cc));
+ DONE;
+ })
+
+;; }}}
+;; {{{ ALU special cases: Plus
+
+; Iterates a pattern over the plus and minus RTX codes.
+(define_code_iterator plus_minus [plus minus])
+
+; Accepts an RTX whose code is plus or minus; used by the post-reload
+; splitters below to match both operations with one pattern.
+(define_predicate "plus_minus_operator"
+ (match_code "plus,minus"))
+
+; Expands to addsi3 and subsi3. The generated RTL carries a use of the
+; value returned by gcn_scalar_exec and clobbers of both SCC and VCC.
+(define_expand "<expander>si3"
+ [(parallel [(set (match_operand:SI 0 "register_operand")
+ (plus_minus:SI (match_operand:SI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")))
+ (use (match_dup 3))
+ (clobber (reg:BI SCC_REG))
+ (clobber (reg:DI VCC_REG))])]
+ ""
+ {
+ operands[3] = gcn_scalar_exec ();
+ })
+
+; 32-bit add; pre-reload undecided unit.
+; Alternatives 0-2 are scalar adds (s_add_i32 / s_addk_i32); alternative 3
+; is the vector add (v_add_i32). Because the unit is not known until an
+; alternative is chosen, the pattern clobbers both SCC and VCC.
+
+(define_insn "*addsi3_vec_and_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= Sg, Sg, Sg, v")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ, B,vBSg")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, n, n, e"))
+ (clobber (reg:BI SCC_REG))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "@
+ s_add_i32\t%0, %1, %2
+ s_addk_i32\t%0, %2
+ s_add_i32\t%0, %1, %2
+ v_add_i32\t%0, %1, %2"
+ [(set_attr "type" "sop2,sopk,sop2,vop2")
+ (set_attr "length" "4,4,8,8")])
+
+; Discard VCC clobber, post reload.
+; Applies once the destination satisfies gcn_sdst_register_operand: the
+; result keeps only the SCC clobber, dropping the use of operand 4 along
+; with the VCC clobber.
+
+(define_split
+ [(set (match_operand:SIDI 0 "register_operand")
+ (match_operator:SIDI 3 "plus_minus_operator"
+ [(match_operand:SIDI 1 "gcn_alu_operand")
+ (match_operand:SIDI 2 "gcn_alu_operand")]))
+ (use (match_operand:DI 4 "" ""))
+ (clobber (reg:BI SCC_REG))
+ (clobber (reg:DI VCC_REG))]
+ "reload_completed && gcn_sdst_register_operand (operands[0], VOIDmode)"
+ [(parallel [(set (match_dup 0)
+ (match_op_dup 3 [(match_dup 1) (match_dup 2)]))
+ (clobber (reg:BI SCC_REG))])])
+
+; Discard SCC clobber, post reload.
+; Applies once the destination satisfies gcn_vgpr_register_operand: the
+; result keeps the use of operand 4 and the VCC clobber.
+; FIXME: do we have an insn for this?
+
+(define_split
+ [(set (match_operand:SIDI 0 "register_operand")
+ (match_operator:SIDI 3 "plus_minus_operator"
+ [(match_operand:SIDI 1 "gcn_alu_operand")
+ (match_operand:SIDI 2 "gcn_alu_operand")]))
+ (use (match_operand:DI 4 ""))
+ (clobber (reg:BI SCC_REG))
+ (clobber (reg:DI VCC_REG))]
+ "reload_completed && gcn_vgpr_register_operand (operands[0], VOIDmode)"
+ [(parallel [(set (match_dup 0)
+ (match_op_dup 3 [(match_dup 1) (match_dup 2)]))
+ (use (match_dup 4))
+ (clobber (reg:DI VCC_REG))])])
+
+; 32-bit add, scalar unit.
+; Same three scalar alternatives as *addsi3_vec_and_scalar, with only the
+; SCC clobber; this is the shape produced by the "Discard VCC clobber"
+; splitter for plus.
+
+(define_insn "*addsi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= Sg, Sg, Sg")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA")
+ (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ, B")))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "@
+ s_add_i32\t%0, %1, %2
+ s_addk_i32\t%0, %2
+ s_add_i32\t%0, %1, %2"
+ [(set_attr "type" "sop2,sopk,sop2")
+ (set_attr "length" "4,4,8")])
+
+; Having this as an insn_and_split allows us to keep together DImode adds
+; through some RTL optimisation passes, and means the CC reg we set isn't
+; dependent on the constraint alternative (which doesn't seem to work well).
+
+; There's an early clobber in the case where "v[0:1]=v[1:2]+?" but
+; "v[0:1]=v[0:1]+?" is fine (as is "v[1:2]=v[0:1]+?", but that's trickier).
+
+; If v_addc_u32 is used to add with carry, a 32-bit literal constant cannot be
+; used as an operand due to the read of VCC, so we restrict constants to the
+; inlinable range for that alternative.
+
+(define_insn_and_split "adddi3"
+ [(set (match_operand:DI 0 "register_operand"
+ "=&Sg,&Sg,&Sg,&Sg,&v,&v,&v,&v")
+ (plus:DI (match_operand:DI 1 "register_operand"
+ " Sg, 0, 0, Sg, v, 0, 0, v")
+ (match_operand:DI 2 "nonmemory_operand"
+ " 0,SgB, 0,SgB, 0,vA, 0,vA")))
+ (clobber (reg:BI SCC_REG))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+ {
+   /* The carry travels in VCC when operand 1 is a VGPR, otherwise in SCC.  */
+   const bool vector_p = gcn_vgpr_register_operand (operands[1], DImode);
+   rtx carry = gen_rtx_REG (BImode, vector_p ? VCC_REG : SCC_REG);
+
+   /* Low halves: plain add that produces the carry.  */
+   rtx dest_lo = gcn_operand_part (DImode, operands[0], 0);
+   rtx src1_lo = gcn_operand_part (DImode, operands[1], 0);
+   rtx src2_lo = gcn_operand_part (DImode, operands[2], 0);
+   emit_insn (gen_addsi3_scalar_carry (dest_lo, src1_lo, src2_lo, carry));
+
+   /* High halves: add with carry-in; use the shorter "add zero" form when
+      the high half of operand 2 is the constant zero.  */
+   rtx src2_hi = gcn_operand_part (DImode, operands[2], 1);
+   rtx dest_hi = gcn_operand_part (DImode, operands[0], 1);
+   rtx src1_hi = gcn_operand_part (DImode, operands[1], 1);
+   if (src2_hi == const0_rtx)
+     emit_insn (gen_addcsi3_scalar_zero (dest_hi, src1_hi, carry));
+   else
+     emit_insn (gen_addcsi3_scalar (dest_hi, src1_hi, src2_hi, carry, carry));
+   DONE;
+ }
+ [(set_attr "type" "mult,mult,mult,mult,vmult,vmult,vmult,vmult")
+ (set_attr "length" "8")
+ ; FIXME: These patterns should have (use (exec)) but that messes up
+ ; the generic splitters, so use single instead
+ (set_attr "exec" "*,*,*,*,single,single,single,single")])
+
+;; Add with carry.
+; Operand 0 receives operand 1 + operand 2; operand 3 (a BImode register,
+; constraint "cs" for the scalar alternative and "cV" for the vector one)
+; receives the carry-out, expressed as (ltu (plus op1 op2) op1).
+; The vector alternative prints the sources as "%2, %1".
+
+(define_insn "addsi3_scalar_carry"
+ [(set (match_operand:SI 0 "register_operand" "= Sg, v")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" " SgB,vB")))
+ (set (match_operand:BI 3 "register_operand" "= cs,cV")
+ (ltu:BI (plus:SI (match_dup 1)
+ (match_dup 2))
+ (match_dup 1)))]
+ ""
+ "@
+ s_add_u32\t%0, %1, %2
+ v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "sop2,vop2")
+ (set_attr "length" "8,8")
+ (set_attr "exec" "*,single")])
+
+; Add-with-carry-out where operand 2 is a constant integer.  Here the carry
+; (operand 4) is written as (geu (plus op1 op2) op3), and the insn condition
+; only accepts the pattern when operand 2 equals the negation of operand 3.
+(define_insn "addsi3_scalar_carry_cst"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, v")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA, v")
+ (match_operand:SI 2 "const_int_operand" " n, n")))
+ (set (match_operand:BI 4 "register_operand" "=cs,cV")
+ (geu:BI (plus:SI (match_dup 1)
+ (match_dup 2))
+ (match_operand:SI 3 "const_int_operand" " n, n")))]
+ "INTVAL (operands[2]) == -INTVAL (operands[3])"
+ "@
+ s_add_u32\t%0, %1, %2
+ v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "sop2,vop2")
+ (set_attr "length" "4")
+ (set_attr "exec" "*,single")])
+
+; Add with carry-in and carry-out.  Operand 3 is the BImode carry-in, which is
+; zero-extended and added to operands 1 and 2.  Operand 4 is the carry-out and
+; is constrained ("3") to the same register as operand 3.  The carry-out is
+; the IOR of the two unsigned-overflow tests written in the second set.
+(define_insn "addcsi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= Sg, v")
+ (plus:SI (plus:SI (zero_extend:SI
+ (match_operand:BI 3 "register_operand" "= cs,cV"))
+ (match_operand:SI 1 "gcn_alu_operand" "%SgA, v"))
+ (match_operand:SI 2 "gcn_alu_operand" " SgB,vA")))
+ (set (match_operand:BI 4 "register_operand" "= 3, 3")
+ (ior:BI (ltu:BI (plus:SI
+ (plus:SI
+ (zero_extend:SI (match_dup 3))
+ (match_dup 1))
+ (match_dup 2))
+ (match_dup 2))
+ (ltu:BI (plus:SI (zero_extend:SI (match_dup 3)) (match_dup 1))
+ (match_dup 1))))]
+ ""
+ "@
+ s_addc_u32\t%0, %1, %2
+ v_addc%^_u32\t%0, vcc, %2, %1, vcc"
+ [(set_attr "type" "sop2,vop2")
+ (set_attr "length" "8,4")
+ (set_attr "exec" "*,single")])
+
+; Add only the carry-in (operand 2, zero-extended) to operand 1; the output
+; templates supply a literal 0 as the other addend.  Operand 2 is also
+; rewritten with the resulting carry-out.
+(define_insn "addcsi3_scalar_zero"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, v")
+ (plus:SI (zero_extend:SI
+ (match_operand:BI 2 "register_operand" "=cs,cV"))
+ (match_operand:SI 1 "gcn_alu_operand" "SgA, v")))
+ (set (match_dup 2)
+ (ltu:BI (plus:SI (zero_extend:SI (match_dup 2))
+ (match_dup 1))
+ (match_dup 1)))]
+ ""
+ "@
+ s_addc_u32\t%0, %1, 0
+ v_addc%^_u32\t%0, vcc, 0, %1, vcc"
+ [(set_attr "type" "sop2,vop2")
+ (set_attr "length" "4")
+ (set_attr "exec" "*,single")])
+
+; "addptr" is the same as "add" except that it must not write to VCC or SCC
+; as a side-effect. Unfortunately GCN3 does not have a suitable instruction
+; for this, so we use a split to save and restore the condition code.
+; This pattern must use "Sg" instead of "SD" to prevent the compiler
+; assigning VCC as the destination.
+; FIXME: Provide GCN5 implementation
+
+(define_insn_and_split "addptrdi3"
+ [(set (match_operand:DI 0 "register_operand" "= Sg, &v")
+ (plus:DI (match_operand:DI 1 "register_operand" " Sg, v0")
+ (match_operand:DI 2 "nonmemory_operand" "SgDB,vDB0")))]
+ ""
+ {
+   /* The scalar alternative is never printed; it is always split.  */
+   if (which_alternative == 0)
+     return "#";
+
+   /* A constant that reaches this point must be an inline 64-bit constant;
+      any other constant is handled by the split below.  */
+   gcc_assert (!CONST_INT_P (operands[2])
+	       || gcn_inline_constant64_p (operands[2]));
+
+   const char *add_insn = "v_add_co_u32";
+   const char *addc_insn = "v_addc_co_u32";
+   if (TARGET_GCN3)
+     {
+       add_insn = "v_add_u32";
+       addc_insn = "v_addc_u32";
+     }
+
+   rtx addend_lo = gcn_operand_part (DImode, operands[2], 0);
+   rtx addend_hi = gcn_operand_part (DImode, operands[2], 1);
+
+   /* Operand 3 of the printed instructions is CC_SAVE_REG, used as the
+      carry register in place of VCC.  */
+   rtx asm_operands[4];
+   asm_operands[0] = operands[0];
+   asm_operands[1] = operands[1];
+   asm_operands[2] = addend_lo;
+   asm_operands[3] = gen_rtx_REG (DImode, CC_SAVE_REG);
+
+   char buf[100];
+
+   /* Low half: add, carry-out to operand 3.  */
+   sprintf (buf, "%s %%L0, %%3, %%2, %%L1", add_insn);
+   output_asm_insn (buf, asm_operands);
+
+   /* High half: add with carry-in and carry-out through operand 3.  */
+   asm_operands[2] = addend_hi;
+   sprintf (buf, "%s %%H0, %%3, %%2, %%H1, %%3", addc_insn);
+   output_asm_insn (buf, asm_operands);
+
+   return "";
+ }
+ "reload_completed
+ && (!gcn_vgpr_register_operand (operands[0], DImode)
+ || (CONST_INT_P (operands[2])
+ && !gcn_inline_constant64_p (operands[2])))"
+ [(const_int 0)]
+ {
+   /* Save the condition register that adddi3 will clobber (VCC for the
+      vector form, SCC for the scalar form) in CC_SAVE_REG, do the add, then
+      put the saved value back.  */
+   const bool vector_p = gcn_vgpr_register_operand (operands[1], DImode);
+   rtx live_cc, saved_cc;
+
+   if (vector_p)
+     {
+       live_cc = gen_rtx_REG (DImode, VCC_REG);
+       saved_cc = gen_rtx_REG (DImode, CC_SAVE_REG);
+       emit_insn (gen_movdi (saved_cc, live_cc));
+     }
+   else
+     {
+       live_cc = gen_rtx_REG (BImode, SCC_REG);
+       saved_cc = gen_rtx_REG (BImode, CC_SAVE_REG);
+       emit_insn (gen_movbi (saved_cc, live_cc));
+     }
+
+   emit_insn (gen_adddi3 (operands[0], operands[1], operands[2]));
+
+   if (vector_p)
+     emit_insn (gen_movdi (live_cc, saved_cc));
+   else
+     emit_insn (gen_movbi (live_cc, saved_cc));
+
+   DONE;
+ }
+ [(set_attr "type" "mult,vmult")
+ (set_attr "length" "16")
+ (set_attr "exec" "*,single")])
+
+;; }}}
+;; {{{ ALU special cases: Minus
+
+;; Note that the expand and splitters are shared with add, above.
+;; See "plus_minus".
+
+; 32-bit subtract covering both units: two scalar alternatives (EXEC operand
+; constraint "n") and two vector alternatives (EXEC operand constraint "e").
+; The pattern clobbers both SCC and VCC.
+; NOTE(review): the two vector alternatives swap which source may be a
+; non-VGPR ("v"/"vBSg" versus "vBSg"/"v") yet both print "%0, %1, %2" with
+; the same mnemonic -- confirm whether a reversed-subtract form was intended
+; for one of them, or whether these templates are never output because the
+; insn is always split first.
+(define_insn "*subsi3_vec_and_scalar"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, v, v")
+ (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA, v,vBSg")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA, B, vBSg, v")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, n, e, e"))
+ (clobber (reg:BI SCC_REG))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "@
+ s_sub_i32\t%0, %1, %2
+ s_sub_i32\t%0, %1, %2
+ v_sub_i32\t%0, %1, %2
+ v_sub_i32\t%0, %1, %2"
+ [(set_attr "type" "sop2,sop2,vop2,vop2")
+ (set_attr "length" "4,8,8,8")])
+
+; 32-bit subtract, scalar unit only; clobbers SCC.  The second alternative
+; takes a "B" second source and has length 8 instead of 4.
+(define_insn "*subsi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg")
+ (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA, B")))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "s_sub_i32\t%0, %1, %2"
+ [(set_attr "type" "sop2,sop2")
+ (set_attr "length" "4,8")])
+
+(define_insn_and_split "subdi3"
+ [(set (match_operand:DI 0 "register_operand" "=Sg, Sg")
+ (minus:DI
+ (match_operand:DI 1 "gcn_alu_operand" "SgA,SgB")
+ (match_operand:DI 2 "gcn_alu_operand" "SgB,SgA")))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+   /* Low halves: subtract and record the borrow in SCC.  */
+   rtx dest_lo = gcn_operand_part (DImode, operands[0], 0);
+   rtx minuend_lo = gcn_operand_part (DImode, operands[1], 0);
+   rtx subtrahend_lo = gcn_operand_part (DImode, operands[2], 0);
+   emit_insn (gen_subsi3_scalar_carry (dest_lo, minuend_lo, subtrahend_lo));
+
+   /* High halves: subtract with borrow; use the "zero" form when the high
+      half of operand 2 is the constant zero.  */
+   rtx subtrahend_hi = gcn_operand_part (DImode, operands[2], 1);
+   rtx dest_hi = gcn_operand_part (DImode, operands[0], 1);
+   rtx minuend_hi = gcn_operand_part (DImode, operands[1], 1);
+   if (subtrahend_hi == const0_rtx)
+     emit_insn (gen_subcsi3_scalar_zero (dest_hi, minuend_hi));
+   else
+     emit_insn (gen_subcsi3_scalar (dest_hi, minuend_hi, subtrahend_hi));
+   DONE;
+ }
+ [(set_attr "length" "8")])
+
+; Scalar subtract that also sets SCC, written as
+; (gtu (minus op1 op2) op1).  Used for the low half of subdi3.
+(define_insn "subsi3_scalar_carry"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg")
+ (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB")
+ (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA")))
+ (set (reg:BI SCC_REG)
+ (gtu:BI (minus:SI (match_dup 1)
+ (match_dup 2))
+ (match_dup 1)))]
+ ""
+ "s_sub_u32\t%0, %1, %2"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "8")])
+
+; Scalar subtract of a constant integer that also sets SCC.  Here SCC is
+; written as (leu (minus op1 op2) op3), and the insn condition only accepts
+; the pattern when operand 2 equals the negation of operand 3.
+(define_insn "subsi3_scalar_carry_cst"
+ [(set (match_operand:SI 0 "register_operand" "=Sg")
+ (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA")
+ (match_operand:SI 2 "const_int_operand" " n")))
+ (set (reg:BI SCC_REG)
+ (leu:BI (minus:SI (match_dup 1)
+ (match_dup 2))
+ (match_operand:SI 3 "const_int_operand" " n")))]
+ "INTVAL (operands[2]) == -INTVAL (operands[3])"
+ "s_sub_u32\t%0, %1, %2"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "4")])
+
+; Scalar subtract with borrow: SCC is read (zero-extended) as an input and
+; rewritten as the IOR of the two (gtu ...) tests in the second set.
+; Used for the high half of subdi3.
+(define_insn "subcsi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg")
+ (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+ (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB"))
+ (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA")))
+ (set (reg:BI SCC_REG)
+ (ior:BI (gtu:BI (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+ (match_dup 1))
+ (match_dup 2))
+ (match_dup 1))
+ (gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+ (match_dup 1))
+ (match_dup 1))))]
+ ""
+ "s_subb_u32\t%0, %1, %2"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "8")])
+
+; Variant of subcsi3_scalar with no second source operand: the output
+; template supplies a literal 0.  subdi3 uses it when the high half of its
+; operand 2 is const0_rtx.
+(define_insn "subcsi3_scalar_zero"
+ [(set (match_operand:SI 0 "register_operand" "=Sg")
+ (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+ (match_operand:SI 1 "gcn_alu_operand" "SgA")))
+ (set (reg:BI SCC_REG)
+ (gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) (match_dup 1))
+ (match_dup 1)))]
+ ""
+ "s_subb_u32\t%0, %1, 0"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "4")])
+
+;; }}}
+;; {{{ ALU: mult
+
+; 32-bit multiply expander.  Operand 3, referenced by (use (match_dup 3)),
+; is filled in with the value returned by gcn_scalar_exec ().
+(define_expand "mulsi3"
+ [(set (match_operand:SI 0 "register_operand")
+ (mult:SI (match_operand:SI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")))
+ (use (match_dup 3))]
+ ""
+ {
+ operands[3] = gcn_scalar_exec ();
+ })
+
+; Vector multiply has vop3a encoding, but no corresponding vop2a, so no long
+; immediate.
+; The three scalar alternatives output "#": after reload, when operand 0
+; satisfies gcn_sdst_register_operand, the insn is split into a plain
+; (set ... (mult ...)) without the (use ...) of operand 3.  Only the vector
+; alternative is output directly.
+(define_insn_and_split "*mulsi3_vec_and_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= Sg,Sg, Sg, v")
+ (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" " SgA, J, B,vASg")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, n, n, e"))]
+ ""
+ "@
+ #
+ #
+ #
+ v_mul_lo_i32\t%0, %1, %2"
+ "reload_completed && gcn_sdst_register_operand (operands[0], VOIDmode)"
+ [(set (match_dup 0)
+ (mult:SI (match_dup 1)
+ (match_dup 2)))]
+ {}
+ [(set_attr "type" "sop2,sopk,sop2,vop3a")
+ (set_attr "length" "4,4,8,4")])
+
+; 32-bit multiply, scalar unit only, with no (use ...) and no clobbers.
+; The second alternative ties operand 1 to the destination and prints
+; s_mulk_i32 with only %0 and the "J" operand.
+(define_insn "*mulsi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= Sg,Sg, Sg")
+ (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA")
+ (match_operand:SI 2 "gcn_alu_operand" " SgA, J, B")))]
+ ""
+ "@
+ s_mul_i32\t%0, %1, %2
+ s_mulk_i32\t%0, %2
+ s_mul_i32\t%0, %1, %2"
+ [(set_attr "type" "sop2,sopk,sop2")
+ (set_attr "length" "4,4,8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit unop
+
+; Unary operations handled by the patterns in this section.
+(define_code_iterator vec_and_scalar_unop [not popcount])
+
+; The const0_rtx serves as a device to differentiate patterns
+; Operand 2 is set to the value returned by gcn_scalar_exec () and operand 3
+; to const0_rtx; both appear in the emitted parallel as (use ...) elements,
+; alongside a clobber of SCC.
+(define_expand "<expander>si2"
+ [(parallel [(set (match_operand:SI 0 "register_operand")
+ (vec_and_scalar_unop:SI
+ (match_operand:SI 1 "gcn_alu_operand")))
+ (use (match_dup 2))
+ (use (match_dup 3))
+ (clobber (reg:BI SCC_REG))])]
+ ""
+ {
+ operands[2] = gcn_scalar_exec ();
+ operands[3] = const0_rtx;
+ })
+
+; Unary operation with one scalar alternative (EXEC operand constraint "n")
+; and one vector alternative (constraint "e").  Matches the form emitted by
+; the "<expander>si2" expander, including the (use (const_int 0)) marker.
+(define_insn "*<expander>si2"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, v")
+ (vec_and_scalar_unop:SI
+ (match_operand:SI 1 "gcn_alu_operand" "SgB,vSgB")))
+ (use (match_operand:DI 2 "gcn_exec_operand" " n, e"))
+ (use (const_int 0))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "@
+ s_<s_mnemonic>0\t%0, %1
+ v_<s_mnemonic>0\t%0, %1"
+ [(set_attr "type" "sop1,vop1")
+ (set_attr "length" "8")])
+
+; Scalar-only form of the unary operation: no (use ...) elements, SCC
+; clobbered.
+(define_insn "*<expander>si2_scalar"
+ [(set (match_operand:SI 0 "register_operand" "=Sg")
+ (vec_and_scalar_unop:SI (match_operand:SI 1 "gcn_alu_operand" "SgB")))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "s_<s_mnemonic>0\t%0, %1"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit binop
+
+; Binary operations for which the generic "<expander>si3" expander below is
+; instantiated.
+(define_code_iterator vec_and_scalar [and ior xor ashift lshiftrt
+ ashiftrt smin smax umin umax])
+
+; Generic 32-bit binary expander.  Operand 3, referenced by
+; (use (match_dup 3)), is set to the value returned by gcn_scalar_exec ();
+; the emitted parallel also clobbers SCC.
+(define_expand "<expander>si3"
+ [(parallel [(set (match_operand:SI 0 "register_operand")
+ (vec_and_scalar:SI
+ (match_operand:SI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")))
+ (use (match_dup 3))
+ (clobber (reg:BI SCC_REG))])]
+ ""
+ {
+ operands[3] = gcn_scalar_exec ();
+ })
+
+; Plus and mult are not listed here: they also have a variant taking a
+; 16-bit immediate, and are therefore defined separately (see above).
+; The insn patterns below split the binary operations into two groups:
+; "_com" patterns mark operand 1 with "%", "_nocom" (the shifts) do not.
+(define_code_iterator vec_and_scalar_com [and ior xor smin smax umin umax])
+(define_code_iterator vec_and_scalar_nocom [ashift lshiftrt ashiftrt])
+
+; 32-bit "_com" binary operation with one scalar alternative (EXEC operand
+; constraint "n") and one vector alternative (constraint "e"); clobbers SCC.
+(define_insn "*<expander>si3"
+ [(set (match_operand:SI 0 "register_operand" "= Sg, v")
+ (vec_and_scalar_com:SI
+ (match_operand:SI 1 "gcn_alu_operand" "%SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" " SgB,vSgB")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, e"))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ v_<mnemonic>0\t%0, %1, %2"
+ [(set_attr "type" "sop2,vop2")
+ (set_attr "length" "8")])
+
+; Scalar-only form of the 32-bit "_com" binary operation: no (use ...)
+; element, SCC clobbered.
+; NOTE(review): operand 1 uses the "register_operand" predicate here but
+; "gcn_alu_operand" in "*<expander>si3", while the constraint in both is
+; "%SgA" -- confirm the narrower predicate is intended.
+(define_insn "*<expander>si3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= Sg")
+ (vec_and_scalar_com:SI
+ (match_operand:SI 1 "register_operand" "%SgA")
+ (match_operand:SI 2 "gcn_alu_operand" " SgB")))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "s_<mnemonic>0\t%0, %1, %2"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "8")])
+
+; We expect this to be split, post-reload to remove the dependency on the
+; exec register in the scalar case.
+; 32-bit shifts: two scalar alternatives (EXEC operand constraint "n"), which
+; differ in which source takes the "SgB" constraint, and one vector
+; alternative (constraint "e").
+
+(define_insn "*<expander>si3_vec_and_scalar"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, v")
+ (vec_and_scalar_nocom:SI
+ (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB,vSgB")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, n, e"))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ s_<mnemonic>0\t%0, %1, %2
+ v_<mnemonic>0\t%0, %1, %2"
+ [(set_attr "type" "sop2,sop2,vop2")
+ (set_attr "length" "8")])
+
+; Scalar-only form of the 32-bit shifts: no (use ...) element, SCC clobbered.
+; This pattern is named (no leading "*").
+(define_insn "<expander>si3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "=Sg,Sg")
+ (vec_and_scalar_nocom:SI
+ (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB")))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ s_<mnemonic>0\t%0, %1, %2"
+ [(set_attr "type" "sop2,sop2")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 64-bit
+
+; 64-bit operations handled by the "_com" DImode patterns below.
+(define_code_iterator vec_and_scalar64_com [and ior xor])
+
+; 64-bit logical expander.  Operand 3, referenced by (use (match_dup 3)), is
+; set to the value returned by gcn_scalar_exec (); the emitted parallel also
+; clobbers SCC.
+(define_expand "<expander>di3"
+ [(parallel [(set (match_operand:DI 0 "register_operand")
+ (vec_and_scalar64_com:DI
+ (match_operand:DI 1 "gcn_alu_operand")
+ (match_operand:DI 2 "gcn_alu_operand")))
+ (use (match_dup 3))
+ (clobber (reg:BI SCC_REG))])]
+ ""
+ {
+ operands[3] = gcn_scalar_exec ();
+ })
+
+; 64-bit logical operation.  The scalar alternative is output as a single
+; instruction.  The two vector alternatives output "#": after reload, when
+; operand 0 satisfies gcn_vgpr_register_operand, the insn is split into two
+; SImode operations, each keeping the (use ...) of operand 3 and the SCC
+; clobber.
+(define_insn_and_split "*<expander>di3_vec_and_scalar"
+ [(set (match_operand:DI 0 "register_operand" "= Sg, &v, &v")
+ (vec_and_scalar64_com:DI
+ (match_operand:DI 1 "gcn_alu_operand" "%SgA, v, 0")
+ (match_operand:DI 2 "gcn_alu_operand" " SgC,vSgB,vSgB")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, e, e"))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ #
+ #"
+ "reload_completed && gcn_vgpr_register_operand (operands[0], DImode)"
+ [(parallel [(set (match_dup 4)
+ (vec_and_scalar64_com:SI (match_dup 5) (match_dup 6)))
+ (use (match_dup 3))
+ (clobber (reg:BI SCC_REG))])
+ (parallel [(set (match_dup 7)
+ (vec_and_scalar64_com:SI (match_dup 8) (match_dup 9)))
+ (use (match_dup 3))
+ (clobber (reg:BI SCC_REG))])]
+ {
+ /* Operands 4-6 are part 0 of operands 0-2; operands 7-9 are part 1.  */
+ operands[4] = gcn_operand_part (DImode, operands[0], 0);
+ operands[5] = gcn_operand_part (DImode, operands[1], 0);
+ operands[6] = gcn_operand_part (DImode, operands[2], 0);
+ operands[7] = gcn_operand_part (DImode, operands[0], 1);
+ operands[8] = gcn_operand_part (DImode, operands[1], 1);
+ operands[9] = gcn_operand_part (DImode, operands[2], 1);
+ }
+ [(set_attr "type" "sop2,vop2,vop2")
+ (set_attr "length" "8")])
+
+; Scalar-only form of the 64-bit logical operation: no (use ...) element,
+; SCC clobbered.
+(define_insn "*<expander>di3_scalar"
+ [(set (match_operand:DI 0 "register_operand" "= Sg")
+ (vec_and_scalar64_com:DI
+ (match_operand:DI 1 "gcn_alu_operand" "%SgA")
+ (match_operand:DI 2 "gcn_alu_operand" " SgC")))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "s_<mnemonic>0\t%0, %1, %2"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "8")])
+
+; 64-bit shift expander: operand 1 is DImode, the shift amount (operand 2)
+; is SImode.  Operand 3 is set to the value returned by gcn_scalar_exec ()
+; and must be referenced from the pattern as (use (match_dup 3)): that gives
+; the generated expander an operand slot for the value stored below, and
+; produces the (use ...) element that "*<expander>di3_vec_and_scalar"
+; requires in order to match.
+(define_expand "<expander>di3"
+ [(parallel [(set (match_operand:DI 0 "register_operand")
+ (vec_and_scalar_nocom:DI
+ (match_operand:DI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")))
+ (use (match_dup 3))
+ (clobber (reg:BI SCC_REG))])]
+ ""
+ {
+ operands[3] = gcn_scalar_exec ();
+ })
+
+; 64-bit shifts with an SImode shift amount: two scalar alternatives (EXEC
+; operand constraint "n") and one vector alternative (constraint "e");
+; clobbers SCC.
+(define_insn "*<expander>di3_vec_and_scalar"
+ [(set (match_operand:DI 0 "register_operand" "=Sg, Sg, v")
+ (vec_and_scalar_nocom:DI
+ (match_operand:DI 1 "gcn_alu_operand" "SgC,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA,SgC,vSgC")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, n, e"))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ s_<mnemonic>0\t%0, %1, %2
+ v_<mnemonic>0\t%0, %1, %2"
+ [(set_attr "type" "sop2,sop2,vop2")
+ (set_attr "length" "8")])
+
+; Scalar-only form of the 64-bit shifts: no (use ...) element, SCC clobbered.
+(define_insn "*<expander>di3_scalar"
+ [(set (match_operand:DI 0 "register_operand" "=Sg, Sg")
+ (vec_and_scalar_nocom:DI
+ (match_operand:DI 1 "gcn_alu_operand" "SgC,SgA")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA,SgC")))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "s_<mnemonic>0\t%0, %1, %2"
+ [(set_attr "type" "sop2,sop2")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Generic splitters
+
+;; These choose the proper insn variant once we've decided on using
+;; vector or scalar ALU.
+
+; Discard (use EXEC) from scalar unops.
+; After reload, a unary operation whose destination satisfies
+; gcn_sdst_register_operand is reduced to the bare (set ...); both (use ...)
+; elements are dropped.
+; NOTE(review): the visible "*<expander>si2" insn also carries
+; (clobber (reg:BI SCC_REG)), which this split pattern does not list --
+; confirm which insns this splitter is expected to match.
+
+(define_split
+ [(set (match_operand 0 "gcn_sdst_register_operand")
+ (match_operator 3 "unary_operator"
+ [(match_operand 1 "gcn_alu_operand")]))
+ (use (match_operand:DI 2 ""))
+ (use (const_int 0))]
+ "reload_completed"
+ [(set (match_dup 0) (match_op_dup 3 [(match_dup 1)]))])
+
+; Discard const0 from valu unops.
+; After reload, a unary operation whose destination satisfies
+; gcn_vgpr_register_operand keeps the (use ...) of operand 2 and loses only
+; the (use (const_int 0)) marker.
+; NOTE(review): the visible "*<expander>si2" insn also carries
+; (clobber (reg:BI SCC_REG)), which this split pattern does not list --
+; confirm which insns this splitter is expected to match.
+
+(define_split
+ [(set (match_operand 0 "gcn_vgpr_register_operand")
+ (match_operator 3 "unary_operator"
+ [(match_operand 1 "gcn_alu_operand")]))
+ (use (match_operand:DI 2 ""))
+ (use (const_int 0))]
+ "reload_completed"
+ [(parallel [(set (match_dup 0)
+ (match_op_dup 3 [(match_dup 1)]))
+ (use (match_dup 2))])])
+
+; Discard (use EXEC) from scalar binops.
+; After reload, a binary operation whose destination satisfies
+; gcn_sdst_register_operand keeps its SCC clobber and loses the (use ...) of
+; operand 3.
+
+(define_split
+ [(set (match_operand 0 "gcn_sdst_register_operand")
+ (match_operator 4 "binary_operator"
+ [(match_operand 1 "gcn_alu_operand")
+ (match_operand 2 "gcn_alu_operand")]))
+ (use (match_operand:DI 3 ""))
+ (clobber (reg:BI SCC_REG))]
+ "reload_completed"
+ [(parallel [(set (match_dup 0)
+ (match_op_dup 4 [(match_dup 1) (match_dup 2)]))
+ (clobber (reg:BI SCC_REG))])])
+
+; Discard (clobber SCC) from valu binops.
+; After reload, a binary operation whose destination satisfies
+; gcn_vgpr_register_operand keeps the (use ...) of operand 3 and loses the
+; SCC clobber.
+
+(define_split
+ [(set (match_operand 0 "gcn_vgpr_register_operand")
+ (match_operator 4 "binary_operator"
+ [(match_operand 1 "gcn_alu_operand")
+ (match_operand 2 "gcn_alu_operand")]))
+ (use (match_operand:DI 3 ""))
+ (clobber (reg:BI SCC_REG))]
+ "reload_completed"
+ [(parallel [(set (match_dup 0)
+ (match_op_dup 4 [(match_dup 1) (match_dup 2)]))
+ (use (match_dup 3))])])
+
+;; }}}
+;; {{{ Atomics
+
+; Each compute unit has its own L1 cache. The L2 cache is shared between
+; all the compute units. Any load or store instruction can skip L1 and
+; access L2 directly using the "glc" flag. Atomic instructions also skip
+; L1. The L1 cache can be flushed and invalidated using instructions.
+;
+; Therefore, in order for "acquire" and "release" atomic modes to work
+; correctly across compute units we must flush before each "release"
+; and invalidate the cache after each "acquire". It might seem like
+; invalidation could be safely done before an "acquire", but since each
+; compute unit can run up to 40 threads simultaneously, all reading values
+; into the L1 cache, this is not actually safe.
+;
+; Additionally, scalar flat instructions access L2 via a different cache
+; (the "constant cache"), so they have separate control instructions. We
+; do not attempt to invalidate both caches at once; instead, atomics
+; operating on scalar flat pointers will flush the constant cache, and
+; atomics operating on flat or global pointers will flush L1. It is up to
+; the programmer to get this right.
+
+; Operations for which the atomic_fetch_* and atomic_* patterns are
+; instantiated.
+(define_code_iterator atomicops [plus minus and ior xor])
+; Suffix appended to the atomic instruction mnemonics: empty for SImode,
+; "_X2" for DImode.
+(define_mode_attr X [(SI "") (DI "_X2")])
+
+;; TODO compare_and_swap test_and_set inc dec
+;; Hardware also supports min and max, but GCC does not.
+
+; Memory barrier expander.  Operand 0 is a volatile BLKmode MEM whose address
+; is a SCRATCH; it is both the destination and the input of the
+; UNSPEC_MEMORY_BARRIER.
+(define_expand "memory_barrier"
+ [(set (match_dup 0)
+ (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+ ""
+ {
+ operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[0]) = 1;
+ })
+
+; Insn matching the RTL produced by the memory_barrier expander; it is output
+; as a single buffer_wbinvl1_vol instruction.
+(define_insn "*memory_barrier"
+ [(set (match_operand:BLK 0)
+ (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+ ""
+ "buffer_wbinvl1_vol"
+ [(set_attr "type" "mubuf")
+ (set_attr "length" "4")])
+
+; FIXME: These patterns have been disabled as they do not seem to work
+; reliably - they can cause hangs or incorrect results.
+; TODO: flush caches according to memory model
+; Atomic fetch-and-op expander (currently disabled by its "0" condition).
+; Operand 0 receives the memory value, operand 1 is the memory location,
+; operand 2 the register operand of the operation and operand 3 a constant
+; integer.  Operand 4 is set to the value returned by gcn_scalar_exec ().
+(define_expand "atomic_fetch_<bare_mnemonic><mode>"
+ [(parallel [(set (match_operand:SIDI 0 "register_operand")
+ (match_operand:SIDI 1 "memory_operand"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(atomicops:SIDI
+ (match_dup 1)
+ (match_operand:SIDI 2 "register_operand"))]
+ UNSPECV_ATOMIC))
+ (use (match_operand 3 "const_int_operand"))
+ (use (match_dup 4))])]
+ "0 /* Disabled. */"
+ {
+ operands[4] = gcn_scalar_exec ();
+ })
+
+; Insn for the atomic fetch-and-op expander (also disabled).  One alternative
+; per memory constraint: "RS" prints an s_atomic_* instruction, "RF" a
+; flat_atomic_* one and "RM" a global_atomic_* one.  Each is printed with the
+; "glc" flag and followed by an s_waitcnt.
+(define_insn "*atomic_fetch_<bare_mnemonic><mode>_insn"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(atomicops:SIDI
+ (match_dup 1)
+ (match_operand:SIDI 2 "register_operand" " Sm, v, v"))]
+ UNSPECV_ATOMIC))
+ (use (match_operand 3 "const_int_operand"))
+ (use (match_operand:DI 4 "gcn_exec_operand" " n, e, e"))]
+ "0 /* Disabled. */"
+ "@
+ s_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+ flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\t0
+ global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; FIXME: These patterns are disabled because the instructions don't
+; seem to work as advertised. Specifically, OMP "team distribute"
+; reductions apparently "lose" some of the writes, similar to what
+; you might expect from a concurrent non-atomic read-modify-write.
+; TODO: flush caches according to memory model
+
+; Atomic read-modify-write expander with no result value (currently disabled
+; by its "0" condition).  Operand 0 is the memory location, operand 1 the
+; register operand of the operation and operand 2 a constant integer.
+; Operand 3 is set to the value returned by gcn_scalar_exec ().
+(define_expand "atomic_<bare_mnemonic><mode>"
+ [(parallel [(set (match_operand:SIDI 0 "memory_operand")
+ (unspec_volatile:SIDI
+ [(atomicops:SIDI
+ (match_dup 0)
+ (match_operand:SIDI 1 "register_operand"))]
+ UNSPECV_ATOMIC))
+ (use (match_operand 2 "const_int_operand"))
+ (use (match_dup 3))])]
+ "0 /* Disabled. */"
+ {
+ operands[3] = gcn_scalar_exec ();
+ })
+
+; Insn for the result-less atomic expander (also disabled).  One alternative
+; per memory constraint ("RS", "RF", "RM"); unlike the fetch variants, the
+; templates carry no "glc" flag.  Each is followed by an s_waitcnt.
+(define_insn "*atomic_<bare_mnemonic><mode>_insn"
+ [(set (match_operand:SIDI 0 "memory_operand" "+RS,RF,RM")
+ (unspec_volatile:SIDI
+ [(atomicops:SIDI
+ (match_dup 0)
+ (match_operand:SIDI 1 "register_operand" " Sm, v, v"))]
+ UNSPECV_ATOMIC))
+ (use (match_operand 2 "const_int_operand"))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, e, e"))]
+ "0 /* Disabled. */"
+ "@
+ s_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\tlgkmcnt(0)
+ flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
+ global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; Mode attributes used by the compare-and-swap patterns below:
+;   x2      - mode of the combined register operand (SI -> DI, DI -> TI);
+;   size    - SUBREG byte offset of the second value within that operand;
+;   bitsize - width suffix of the ds_cmpst_rtn_b* mnemonic.
+(define_mode_attr x2 [(SI "DI") (DI "TI")])
+(define_mode_attr size [(SI "4") (DI "8")])
+(define_mode_attr bitsize [(SI "32") (DI "64")])
+
+(define_expand "sync_compare_and_swap<mode>"
+ [(match_operand:SIDI 0 "register_operand")
+ (match_operand:SIDI 1 "memory_operand")
+ (match_operand:SIDI 2 "register_operand")
+ (match_operand:SIDI 3 "register_operand")]
+ ""
+ {
+   if (MEM_ADDR_SPACE (operands[1]) != ADDR_SPACE_LDS)
+     {
+       /* The instruction takes operands 2 and 3 as one combined value held
+	  in consecutive registers: operand 3 at byte offset 0 and operand 2
+	  at byte offset <size>.  */
+       rtx combined = gen_reg_rtx (<x2>mode);
+       rtx first_part = gen_rtx_SUBREG (<MODE>mode, combined, 0);
+       emit_move_insn (first_part, operands[3]);
+       rtx second_part = gen_rtx_SUBREG (<MODE>mode, combined, <size>);
+       emit_move_insn (second_part, operands[2]);
+
+       rtx exec = gcn_scalar_exec ();
+       emit_insn (gen_sync_compare_and_swap<mode>_insn (operands[0],
+							 operands[1],
+							 combined, exec));
+     }
+   else
+     {
+       /* LDS memory has its own insn, taking the two values separately.  */
+       rtx exec = gcn_scalar_exec ();
+       emit_insn (gen_sync_compare_and_swap<mode>_lds_insn (operands[0],
+							     operands[1],
+							     operands[2],
+							     operands[3],
+							     exec));
+     }
+   DONE;
+ })
+
+; Compare-and-swap for the "RS", "RF" and "RM" memory constraints.
+; Operand 2 is the combined <x2>-mode register built by the expander.  Each
+; alternative prints the instruction with "glc" followed by an s_waitcnt.
+(define_insn "sync_compare_and_swap<mode>_insn"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(match_operand:<x2> 2 "register_operand" " Sm, v, v")]
+ UNSPECV_ATOMIC))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, e, e"))]
+ ""
+ "@
+ s_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+ flat_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\t0
+ global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; Compare-and-swap for memory with the "RL" constraint; the expander selects
+; this insn when the address space is ADDR_SPACE_LDS.  Operands 2 and 3 are
+; passed as separate VGPRs and printed in that order after the address.
+(define_insn "sync_compare_and_swap<mode>_lds_insn"
+ [(set (match_operand:SIDI 0 "register_operand" "= v")
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 1 "memory_operand" "+RL")]
+ UNSPECV_ATOMIC))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 2 "register_operand" " v")
+ (match_operand:SIDI 3 "register_operand" " v")]
+ UNSPECV_ATOMIC))
+ (use (match_operand:DI 4 "gcn_exec_operand" " e"))]
+ ""
+ "ds_cmpst_rtn_b<bitsize> %0, %1, %2, %3\;s_waitcnt\tlgkmcnt(0)"
+ [(set_attr "type" "ds")
+ (set_attr "length" "12")])
+
+; Atomic load expander: forwards operands 0-2 to atomic_load<mode>_insn,
+; adding the value returned by gcn_scalar_exec () as the fourth operand.
+(define_expand "atomic_load<mode>"
+ [(match_operand:SIDI 0 "register_operand")
+ (match_operand:SIDI 1 "memory_operand")
+ (match_operand 2 "immediate_operand")]
+ ""
+ {
+ emit_insn (gen_atomic_load<mode>_insn (operands[0], operands[1],
+ operands[2], gcn_scalar_exec ()));
+ DONE;
+ })
+
+; Atomic load.  The output sequence is chosen by the memory model constant in
+; operand 2 (outer switch) and by the alternative (inner switch: 0 = "RS",
+; 1 = "RF", 2 = "RM").  Any memory model not listed reaches
+; gcc_unreachable ().
+(define_insn "atomic_load<mode>_insn"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 1 "memory_operand" " RS,RF,RM")]
+ UNSPECV_ATOMIC))
+ (use (match_operand:SIDI 2 "immediate_operand" " i, i, i"))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, e, e"))]
+ ""
+ {
+ switch (INTVAL (operands[2]))
+ {
+ /* Relaxed: load with "glc" and wait; no cache instructions.  */
+ case MEMMODEL_RELAXED:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
+ case 2:
+ return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ /* Acquire: load and wait, then one cache instruction.
+ NOTE(review): the scalar case ends with s_dcache_wb_vol, whereas the
+ seq_cst scalar case below ends with s_dcache_inv_vol -- confirm which
+ is intended after an acquire.  */
+ case MEMMODEL_CONSUME:
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
+ "s_dcache_wb_vol";
+ case 1:
+ return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
+ "buffer_wbinvl1_vol";
+ case 2:
+ return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
+ "buffer_wbinvl1_vol";
+ }
+ break;
+ /* Acq_rel / seq_cst: a cache instruction both before the load and
+ after the wait.  */
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
+ "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
+ "s_waitcnt\t0\;buffer_wbinvl1_vol";
+ case 2:
+ return "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ }
+ gcc_unreachable ();
+ }
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "20")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; Atomic store expander: forwards operands 0-2 to atomic_store<mode>_insn,
+; adding the value returned by gcn_scalar_exec () as the fourth operand.
+(define_expand "atomic_store<mode>"
+ [(match_operand:SIDI 0 "memory_operand")
+ (match_operand:SIDI 1 "register_operand")
+ (match_operand 2 "immediate_operand")]
+ ""
+ {
+ emit_insn (gen_atomic_store<mode>_insn (operands[0], operands[1],
+ operands[2], gcn_scalar_exec ()));
+ DONE;
+ })
+
+; Atomic store.  The output sequence is chosen by the memory model constant
+; in operand 2 (outer switch) and by the alternative (inner switch: 0 = "RS",
+; 1 = "RF", 2 = "RM").  Any memory model not listed reaches
+; gcc_unreachable ().
+(define_insn "atomic_store<mode>_insn"
+ [(set (match_operand:SIDI 0 "memory_operand" "=RS,RF,RM")
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 1 "register_operand" " Sm, v, v")]
+ UNSPECV_ATOMIC))
+ (use (match_operand:SIDI 2 "immediate_operand" " i, i, i"))
+ (use (match_operand:DI 3 "gcn_exec_operand" " n, e, e"))]
+ ""
+ {
+ switch (INTVAL (operands[2]))
+ {
+ /* Relaxed: store with "glc" and wait; no cache instructions.  */
+ case MEMMODEL_RELAXED:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_store%o1\t%1, %A0 glc\;s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "flat_store%o1\t%A0, %1%O0 glc\;s_waitcnt\t0";
+ case 2:
+ return "global_store%o1\t%A0, %1%O0 glc\;s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ /* Release: one cache instruction before the store.  */
+ case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+ "s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\t0";
+ case 2:
+ return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ /* Acq_rel / seq_cst: a cache instruction both before the store and
+ after the wait.  */
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+ "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\t0\;buffer_wbinvl1_vol";
+ case 2:
+ return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ }
+ gcc_unreachable ();
+ }
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "20")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; Atomic exchange expander: forwards operands 0-3 to
+; atomic_exchange<mode>_insn, adding the value returned by
+; gcn_scalar_exec () as the fifth operand.
+(define_expand "atomic_exchange<mode>"
+ [(match_operand:SIDI 0 "register_operand")
+ (match_operand:SIDI 1 "memory_operand")
+ (match_operand:SIDI 2 "register_operand")
+ (match_operand 3 "immediate_operand")]
+ ""
+ {
+ emit_insn (gen_atomic_exchange<mode>_insn (operands[0], operands[1],
+ operands[2], operands[3],
+ gcn_scalar_exec ()));
+ DONE;
+ })
+
+; Atomic exchange.  The output sequence is chosen by the memory model
+; constant in operand 3 (outer switch) and by the alternative (inner switch:
+; 0 = "RS", 1 = "RF", 2 = "RM").  Any memory model not listed reaches
+; gcc_unreachable ().
+(define_insn "atomic_exchange<mode>_insn"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 2 "register_operand" " Sm, v, v")]
+ UNSPECV_ATOMIC))
+ (use (match_operand 3 "immediate_operand"))
+ (use (match_operand:DI 4 "gcn_exec_operand" " n, e, e"))]
+ ""
+ {
+ switch (INTVAL (operands[3]))
+ {
+ /* Relaxed: swap with "glc" and wait; no cache instructions.  */
+ case MEMMODEL_RELAXED:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0";
+ case 2:
+ return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ /* Acquire: cache instructions follow the swap and wait.  */
+ case MEMMODEL_CONSUME:
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)\;"
+ "s_dcache_wb_vol\;s_dcache_inv_vol";
+ case 1:
+ return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
+ "buffer_wbinvl1_vol";
+ case 2:
+ return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ /* Release: one cache instruction precedes the swap.  */
+ case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\t0";
+ case 2:
+ return "buffer_wbinvl1_vol\;"
+ "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ /* Acq_rel / seq_cst: a cache instruction both before the swap and
+ after the wait.  */
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\t0\;buffer_wbinvl1_vol";
+ case 2:
+ return "buffer_wbinvl1_vol\;"
+ "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ }
+ gcc_unreachable ();
+ }
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "20")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+;; }}}
+;; {{{ OpenACC / OpenMP
+
+; Moves into operand 0 the SImode low part of the rtx returned by
+; gcn_oacc_dim_size for the constant dimension number in operand 1.
+(define_expand "oacc_dim_size"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "const_int_operand")]
+ ""
+ {
+ rtx tmp = gcn_oacc_dim_size (INTVAL (operands[1]));
+ emit_move_insn (operands[0], gen_lowpart (SImode, tmp));
+ DONE;
+ })
+
+; Moves into operand 0 the rtx returned by gcn_oacc_dim_pos for the constant
+; dimension number in operand 1.
+(define_expand "oacc_dim_pos"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "const_int_operand")]
+ ""
+ {
+ emit_move_insn (operands[0], gcn_oacc_dim_pos (INTVAL (operands[1])));
+ DONE;
+ })
+
+; Barrier expander.  Operand 0 is a volatile BLKmode MEM whose address is a
+; SCRATCH; it is both the destination and the input of the UNSPECV_BARRIER.
+(define_expand "gcn_wavefront_barrier"
+ [(set (match_dup 0)
+ (unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+ ""
+ {
+ operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[0]) = 1;
+ })
+
+; Insn matching the RTL produced by the gcn_wavefront_barrier expander; it is
+; output as a single s_barrier instruction.
+(define_insn "*gcn_wavefront_barrier"
+ [(set (match_operand:BLK 0 "")
+ (unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+ ""
+ "s_barrier"
+ [(set_attr "type" "sopp")])
+
+; Placeholder expander: its preparation code calls gcc_unreachable (), so
+; expanding it aborts the compiler.
+(define_expand "oacc_fork"
+ [(set (match_operand:SI 0 "")
+ (match_operand:SI 1 ""))
+ (use (match_operand:SI 2 ""))]
+ ""
+ {
+ /* We need to have oacc_fork/oacc_join named patterns as a pair,
+ but the fork isn't actually used. */
+ gcc_unreachable ();
+ })
+
+; Expands to a gcn_wavefront_barrier; none of the three operands is used by
+; the preparation code.
+(define_expand "oacc_join"
+ [(set (match_operand:SI 0 "")
+ (match_operand:SI 1 ""))
+ (use (match_operand:SI 2 ""))]
+ ""
+ {
+ emit_insn (gen_gcn_wavefront_barrier ());
+ DONE;
+ })
+
+;; }}}
+
+(include "gcn-valu.md")
new file mode 100644
@@ -0,0 +1,78 @@
+; Options for the GCN port of the compiler.
+
+; Copyright (C) 2016-2018 Free Software Foundation, Inc.
+;
+; This file is part of GCC.
+;
+; GCC is free software; you can redistribute it and/or modify it under
+; the terms of the GNU General Public License as published by the Free
+; Software Foundation; either version 3, or (at your option) any later
+; version.
+;
+; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+; for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with GCC; see the file COPYING3. If not see
+; <http://www.gnu.org/licenses/>.
+
+HeaderInclude
+config/gcn/gcn-opts.h
+
+; GPU names accepted by the options that use Enum(gpu_type) (-march= and
+; -mtune= below), each mapped to an enum processor_type value.
+Enum
+Name(gpu_type) Type(enum processor_type)
+GCN GPU type to use:
+
+EnumValue
+Enum(gpu_type) String(carrizo) Value(PROCESSOR_CARRIZO)
+
+EnumValue
+Enum(gpu_type) String(fiji) Value(PROCESSOR_FIJI)
+
+EnumValue
+Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA)
+
+march=
+Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_CARRIZO)
+Specify the name of the target GPU.
+
+mtune=
+Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_tune) Init(PROCESSOR_CARRIZO)
+Specify the name of the target GPU.
+
+m32
+Target Report RejectNegative InverseMask(ABI64)
+Generate code for a 32-bit ABI.
+
+m64
+Target Report RejectNegative Mask(ABI64)
+Generate code for a 64-bit ABI.
+
+mgomp
+Target Report RejectNegative
+Enable OpenMP GPU offloading.
+
+bool flag_bypass_init_error = false
+
+mbypass-init-error
+Target Report RejectNegative Var(flag_bypass_init_error)
+
+bool flag_worker_partitioning = false
+
+macc-experimental-workers
+Target Report Var(flag_worker_partitioning) Init(1)
+
+int stack_size_opt = -1
+
+mstack-size=
+Target Report RejectNegative Joined UInteger Var(stack_size_opt) Init(-1)
+-mstack-size=<number> Set the private segment size per wave-front, in bytes.
+
+; The GCN mkoffload tool appends -mlocal-symbol-id=<name> to the offload
+; compiler command line, where <name> is the last non-option argument it was
+; given.  The argument is stored in local_symbol_id.
+mlocal-symbol-id=
+Target RejectNegative Report JoinedOrMissing Var(local_symbol_id) Init(0)
+
+Wopenacc-dims
+Target Var(warn_openacc_dims) Warning
+Warn about invalid OpenACC dimensions.
new file mode 100644
@@ -0,0 +1,697 @@
+/* Offload image generation tool for AMD GCN.
+
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+/* Munges GCN assembly into a C source file defining the GCN code as a
+ string.
+
+ This is not a complete assembler. We presume the source is well
+ formed from the compiler and can die horribly if it is not. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "obstack.h"
+#include "diagnostic.h"
+#include "intl.h"
+#include <libgen.h>
+#include "collect-utils.h"
+#include "gomp-constants.h"
+
+const char tool_name[] = "gcn mkoffload";
+
+#define COMMENT_PREFIX "#"
+
+/* Node of a singly-linked list of names: NEXT links to the following node
+   and GCN_NAME points to the name string.
+   NOTE(review): the lists declared from this type do not appear to be
+   populated anywhere in this tool -- confirm whether it is still needed.  */
+struct id_map
+{
+ id_map *next;
+ char *gcn_name;
+};
+
+static id_map *func_ids, **funcs_tail = &func_ids;
+static id_map *var_ids, **vars_tail = &var_ids;
+
+/* Files to unlink. */
+static const char *gcn_s1_name;
+static const char *gcn_s2_name;
+static const char *gcn_o_name;
+static const char *gcn_cfile_name;
+
+enum offload_abi offload_abi = OFFLOAD_ABI_UNSET;
+
+/* Remove every temporary file whose name has been recorded so far: the
+   generated C file, both intermediate assembly files and the object file,
+   in that order.  FROM_SIGNAL is accepted for interface compatibility and
+   is not consulted.  */
+
+void
+tool_cleanup (bool from_signal ATTRIBUTE_UNUSED)
+{
+  const char *const temps[] = {
+    gcn_cfile_name, gcn_s1_name, gcn_s2_name, gcn_o_name
+  };
+
+  for (size_t i = 0; i < sizeof (temps) / sizeof (temps[0]); i++)
+    if (temps[i] != NULL)
+      maybe_unlink (temps[i]);
+}
+
+/* Argument-less wrapper around tool_cleanup so that it can be registered
+   with atexit (see main); always reports "not from a signal".  */
+
+static void
+mkoffload_cleanup (void)
+{
+ tool_cleanup (false);
+}
+
+/* Delete FILE, unless temporaries are being kept (save_temps), in which
+   case the file is left alone and only mentioned on stderr when verbose.
+   A failed deletion is fatal unless the file simply did not exist.  */
+
+void
+maybe_unlink (const char *file)
+{
+  if (save_temps)
+    {
+      if (verbose)
+        fprintf (stderr, "[Leaving %s]\n", file);
+      return;
+    }
+
+  int failed = unlink_if_ordinary (file);
+  if (failed && errno != ENOENT)
+    fatal_error (input_location, "deleting file %s: %m", file);
+}
+
+/* Install STRING (of the form "NAME=value") into the environment with
+   putenv.  In verbose mode the string is echoed to standard error first.  */
+
+static void
+xputenv (const char *string)
+{
+  char *entry = CONST_CAST (char *, string);
+
+  if (verbose)
+    fprintf (stderr, "%s\n", string);
+
+  putenv (entry);
+}
+
+/* Read everything that remains in STREAM into a freshly allocated buffer.
+   The buffer is NUL terminated, but the data itself may contain NUL bytes,
+   so the number of bytes read is stored in *PLEN.  The caller owns the
+   returned buffer.  */
+
+static const char *
+read_file (FILE *stream, size_t *plen)
+{
+  size_t capacity = 16384;
+  size_t used = 0;
+
+  /* When the stream is seekable, size the buffer from the file length so
+     that it normally never needs to grow.  */
+  if (fseek (stream, 0, SEEK_END) == 0)
+    {
+      long size = ftell (stream);
+      if (size >= 0)
+        capacity = size + 100;
+      fseek (stream, 0, SEEK_SET);
+    }
+
+  char *data = XNEWVEC (char, capacity);
+
+  /* Always leave one byte free for the terminating NUL.  */
+  size_t got;
+  while ((got = fread (data + used, 1, capacity - used - 1, stream)) != 0)
+    {
+      used += got;
+      if (used + 1 == capacity)
+        {
+          capacity *= 2;
+          data = XRESIZEVEC (char, data, capacity);
+        }
+    }
+
+  data[used] = 0;
+  *plen = used;
+  return data;
+}
+
+/* Split STR at every ':' and store the resulting tokens, each a freshly
+   allocated NUL-terminated string, in a newly allocated array returned
+   through *PVALUES.  Returns the number of tokens, which is one more than
+   the number of ':' characters (so an empty STR yields one empty token).
+
+   If STR is NULL (e.g. an unset environment variable), *PVALUES is set to
+   NULL and 0 is returned.  The result can be released with
+   free_array_of_ptrs.  */
+
+static unsigned
+parse_env_var (const char *str, char ***pvalues)
+{
+  if (str == NULL)
+    {
+      *pvalues = NULL;
+      return 0;
+    }
+
+  /* Count the tokens.  */
+  unsigned num = 1;
+  for (const char *p = strchr (str, ':'); p != NULL; p = strchr (p + 1, ':'))
+    num++;
+
+  char **values = (char **) xmalloc (num * sizeof (char *));
+  const char *curval = str;
+
+  for (unsigned i = 0; i < num; i++)
+    {
+      const char *end = strchr (curval, ':');
+      if (end == NULL)
+        end = curval + strlen (curval);
+
+      size_t l = end - curval;
+      values[i] = (char *) xmalloc (l + 1);
+      memcpy (values[i], curval, l);
+      values[i][l] = 0;
+
+      /* Only step over a separator; never advance past the terminating
+         NUL of STR.  */
+      curval = (*end == ':') ? end + 1 : end;
+    }
+
+  *pvalues = values;
+  return num;
+}
+
+/* Release an array of heap pointers together with the array itself.
+   At most N leading elements of PTR are freed; freeing of elements stops
+   at the first NULL element.  A NULL PTR is a no-op.  */
+
+static void
+free_array_of_ptrs (void **ptr, unsigned n)
+{
+  if (ptr == NULL)
+    return;
+
+  for (unsigned idx = 0; idx < n && ptr[idx] != NULL; idx++)
+    free (ptr[idx]);
+
+  free (ptr);
+}
+
+/* Like access (NAME, MODE), except that a request for X_OK fails with -1
+   when NAME cannot be stat'ed or names a directory, so that directories
+   are never reported as executable.  */
+
+static int
+access_check (const char *name, int mode)
+{
+  struct stat st;
+
+  if (mode == X_OK && (stat (name, &st) < 0 || S_ISDIR (st.st_mode)))
+    return -1;
+
+  return access (name, mode);
+}
+
+/* Parse an input assembler file, extract the offload tables etc.,
+   and output (1) the assembler code, minus the tables (which can contain
+   problematic relocations), and (2) a C file with the offload tables
+   encoded as structured data.
+
+   IN is the assembly to read, OUT receives every line that is not part of
+   the .gnu.offload_vars / .gnu.offload_funcs sections, and CFILE receives
+   the C definitions of the "vars" and "gcn_kernels" tables.  */
+
+static void
+process_asm (FILE *in, FILE *out, FILE *cfile)
+{
+  int fn_count = 0, var_count = 0, dims_count = 0;
+  struct obstack fns_os, vars_os, varsizes_os, dims_os;
+  obstack_init (&fns_os);
+  obstack_init (&vars_os);
+  obstack_init (&varsizes_os);
+  obstack_init (&dims_os);
+
+  struct oaccdims
+  {
+    int d[3];
+    char *name;
+  } dim;
+
+  char buf[1000];
+  enum { IN_CODE, IN_VARS, IN_FUNCS } state = IN_CODE;
+  while (fgets (buf, sizeof (buf), in))
+    {
+      /* Note: sscanf returns EOF (which is non-zero) when the input ends
+         before the first conversion, so every result below is compared
+         against the exact number of expected assignments.  Testing for
+         "non-zero" would use the %ms/%u outputs uninitialized.  */
+      switch (state)
+        {
+        case IN_CODE:
+          {
+            if (sscanf (buf, " ;; OPENACC-DIMS: %d, %d, %d : %ms\n",
+                        &dim.d[0], &dim.d[1], &dim.d[2], &dim.name) == 4)
+              {
+                obstack_grow (&dims_os, &dim, sizeof (dim));
+                dims_count++;
+              }
+            break;
+          }
+        case IN_VARS:
+          {
+            char *varname;
+            unsigned varsize;
+            if (sscanf (buf, " .8byte %ms\n", &varname) == 1)
+              {
+                obstack_ptr_grow (&vars_os, varname);
+                /* The variable's size is expected on the next line.  */
+                if (!fgets (buf, sizeof (buf), in)
+                    || sscanf (buf, " .8byte %u\n", &varsize) != 1)
+                  abort ();
+                obstack_int_grow (&varsizes_os, varsize);
+                var_count++;
+              }
+            break;
+          }
+        case IN_FUNCS:
+          {
+            char *funcname;
+            if (sscanf (buf, "\t.8byte\t%ms\n", &funcname) == 1)
+              {
+                obstack_ptr_grow (&fns_os, funcname);
+                fn_count++;
+                /* Table entries are neither section switches nor code.  */
+                continue;
+              }
+            break;
+          }
+        }
+
+      /* Track which section the current line switches to.  */
+      char dummy;
+      if (sscanf (buf, " .section .gnu.offload_vars%c", &dummy) > 0)
+        state = IN_VARS;
+      else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0)
+        state = IN_FUNCS;
+      else if (sscanf (buf, " .section %c", &dummy) > 0
+               || sscanf (buf, " .text%c", &dummy) > 0
+               || sscanf (buf, " .bss%c", &dummy) > 0
+               || sscanf (buf, " .data%c", &dummy) > 0
+               || sscanf (buf, " .ident %c", &dummy) > 0)
+        state = IN_CODE;
+
+      /* Only lines outside the offload tables are copied through.  */
+      if (state == IN_CODE)
+        fputs (buf, out);
+    }
+
+  char **fns = XOBFINISH (&fns_os, char **);
+  struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
+
+  fprintf (cfile, "#include <stdlib.h>\n");
+  fprintf (cfile, "#include <stdbool.h>\n\n");
+
+  /* Dump out the global variable table.  */
+  char **vars = XOBFINISH (&vars_os, char **);
+  unsigned *varsizes = XOBFINISH (&varsizes_os, unsigned *);
+  fprintf (cfile,
+           "static const struct global_var_info {\n"
+           " const char *name;\n"
+           " void *address;\n"
+           "} vars[] = {\n");
+  int i;
+  for (i = 0; i < var_count; ++i)
+    {
+      const char *sep = i < var_count - 1 ? "," : " ";
+      fprintf (cfile, " { \"%s\", NULL }%s /* size: %u */\n", vars[i], sep,
+               varsizes[i]);
+      /* The name was allocated by sscanf's %ms conversion.  */
+      free (vars[i]);
+    }
+  fprintf (cfile, "};\n\n");
+
+  obstack_free (&vars_os, NULL);
+  obstack_free (&varsizes_os, NULL);
+
+  /* Dump out function idents.  */
+  fprintf (cfile, "static const struct hsa_kernel_description {\n"
+           " const char *name;\n"
+           " unsigned omp_data_size;\n"
+           " bool gridified_kernel_p;\n"
+           " unsigned kernel_dependencies_count;\n"
+           " const char **kernel_dependencies;\n"
+           " int oacc_dims[3];\n"
+           "} gcn_kernels[] = {\n ");
+  dim.d[0] = dim.d[1] = dim.d[2] = 0;
+  const char *comma;
+  for (comma = "", i = 0; i < fn_count; comma = ",\n ", i++)
+    {
+      /* Find if we recorded dimensions for this function.  */
+      int *d = dim.d; /* Previously zeroed.  */
+      for (int j = 0; j < dims_count; j++)
+        if (strcmp (fns[i], dims[j].name) == 0)
+          {
+            d = dims[j].d;
+            break;
+          }
+
+      fprintf (cfile, "%s{\"%s\", 0, 0, 0, NULL, {%d, %d, %d}}", comma,
+               fns[i], d[0], d[1], d[2]);
+
+      free (fns[i]);
+    }
+  fprintf (cfile, "\n};\n\n");
+
+  obstack_free (&fns_os, NULL);
+  for (i = 0; i < dims_count; i++)
+    free (dims[i].name);
+  obstack_free (&dims_os, NULL);
+}
+
+/* Embed an object file into a C source file.
+
+   The whole of IN is read and written to CFILE as the byte array
+   "gcn_code", followed by the "gcn_image" and "target_data" descriptors
+   (which refer to the "gcn_kernels" and "vars" tables that process_asm
+   emits) and by a constructor/destructor pair that registers and
+   unregisters the image through GOMP_offload_register_ver and
+   GOMP_offload_unregister_ver.  */
+
+static void
+process_obj (FILE *in, FILE *cfile)
+{
+  size_t len = 0;
+  const char *input = read_file (in, &len);
+
+  /* Dump out an array containing the binary.
+     FIXME: do this with objcopy.  */
+  fprintf (cfile, "static unsigned char gcn_code[] = {");
+  for (size_t i = 0; i < len; i += 17)
+    {
+      fprintf (cfile, "\n\t");
+      for (size_t j = i; j < i + 17 && j < len; j++)
+        fprintf (cfile, "%3u,", (unsigned char) input[j]);
+    }
+  fprintf (cfile, "\n};\n\n");
+
+  /* The image contents have been written out; release the buffer that
+     read_file allocated.  */
+  free (CONST_CAST (char *, input));
+
+  fprintf (cfile,
+           "static const struct gcn_image {\n"
+           " char magic[4];\n"
+           " size_t size;\n"
+           " void *image;\n"
+           "} gcn_image = {\n"
+           " \"GCN\",\n"
+           " %zu,\n"
+           " gcn_code\n"
+           "};\n\n",
+           len);
+
+  fprintf (cfile,
+           "static const struct brig_image_desc {\n"
+           " const struct gcn_image *gcn_image;\n"
+           " unsigned kernel_count;\n"
+           " const struct hsa_kernel_description *kernel_infos;\n"
+           " unsigned global_variable_count;\n"
+           " const struct global_var_info *global_variables;\n"
+           "} target_data = {\n"
+           " &gcn_image,\n"
+           " sizeof (gcn_kernels) / sizeof (gcn_kernels[0]),\n"
+           " gcn_kernels,\n"
+           " sizeof (vars) / sizeof (vars[0]),\n"
+           " vars\n"
+           "};\n\n");
+
+  fprintf (cfile,
+           "#ifdef __cplusplus\n"
+           "extern \"C\" {\n"
+           "#endif\n"
+           "extern void GOMP_offload_register_ver"
+           " (unsigned, const void *, int, const void *);\n"
+           "extern void GOMP_offload_unregister_ver"
+           " (unsigned, const void *, int, const void *);\n"
+           "#ifdef __cplusplus\n"
+           "}\n"
+           "#endif\n\n");
+
+  fprintf (cfile, "extern const void *const __OFFLOAD_TABLE__[];\n\n");
+
+  fprintf (cfile, "static __attribute__((constructor)) void init (void)\n"
+           "{\n"
+           " GOMP_offload_register_ver (%#x, __OFFLOAD_TABLE__,"
+           " %d/*GCN*/, &target_data);\n"
+           "};\n",
+           GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_GCN),
+           GOMP_DEVICE_GCN);
+
+  fprintf (cfile, "static __attribute__((destructor)) void fini (void)\n"
+           "{\n"
+           " GOMP_offload_unregister_ver (%#x, __OFFLOAD_TABLE__,"
+           " %d/*GCN*/, &target_data);\n"
+           "};\n",
+           GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_GCN),
+           GOMP_DEVICE_GCN);
+}
+
+/* Compile the C file INFILE into the object OUTFILE by running COMPILER
+   with "-c -o OUTFILE".  -save-temps and -v are forwarded when active and
+   -m64/-m32 is chosen from offload_abi.  COLLECT_GCC_OPTIONS must be set in
+   the environment, otherwise a fatal error is raised.  */
+
+static void
+compile_native (const char *infile, const char *outfile, const char *compiler)
+{
+  if (getenv ("COLLECT_GCC_OPTIONS") == NULL)
+    fatal_error (input_location,
+                 "environment variable COLLECT_GCC_OPTIONS must be set");
+
+  struct obstack args;
+  obstack_init (&args);
+
+  obstack_ptr_grow (&args, compiler);
+  if (save_temps)
+    obstack_ptr_grow (&args, "-save-temps");
+  if (verbose)
+    obstack_ptr_grow (&args, "-v");
+
+  /* The host object must use the same ABI as the rest of the link.  */
+  if (offload_abi == OFFLOAD_ABI_LP64)
+    obstack_ptr_grow (&args, "-m64");
+  else if (offload_abi == OFFLOAD_ABI_ILP32)
+    obstack_ptr_grow (&args, "-m32");
+  else
+    gcc_unreachable ();
+
+  obstack_ptr_grow (&args, infile);
+  obstack_ptr_grow (&args, "-c");
+  obstack_ptr_grow (&args, "-o");
+  obstack_ptr_grow (&args, outfile);
+  obstack_ptr_grow (&args, NULL);
+
+  const char **host_argv = XOBFINISH (&args, const char **);
+  fork_execute (host_argv[0], CONST_CAST (char **, host_argv), true);
+  obstack_free (&args, NULL);
+}
+
+/* Entry point of the GCN mkoffload tool.  In order, it:
+   1. locates the offload compiler driver (GCC_INSTALL_NAME), next to
+      COLLECT_GCC or in one of the COMPILER_PATH directories;
+   2. scans the arguments for -foffload-abi=, -fopenmp, -fopenacc,
+      -save-temps and -v;
+   3. runs the driver with -S -xlto to produce assembly (gcn_s1_name);
+   4. filters that assembly with process_asm into gcn_s2_name and starts
+      the generated C file (gcn_cfile_name);
+   5. runs the driver again to assemble/link gcn_s2_name into gcn_o_name;
+   6. embeds that object into the C file with process_obj;
+   7. compiles the C file with COLLECT_GCC into the file named by -o.
+   Returns 0 on success; failures are reported through fatal_error.  */
+
+int
+main (int argc, char **argv)
+{
+ FILE *in = stdin;
+ FILE *out = stdout;
+ FILE *cfile = stdout;
+ const char *outname = 0, *offloadsrc = 0;
+
+ progname = "mkoffload";
+ diagnostic_initialize (global_dc, 0);
+
+ /* Make sure the temporary files are removed however we exit. */
+ if (atexit (mkoffload_cleanup) != 0)
+ fatal_error (input_location, "atexit failed");
+
+ char *collect_gcc = getenv ("COLLECT_GCC");
+ if (collect_gcc == NULL)
+ fatal_error (input_location, "COLLECT_GCC must be set.");
+ /* dirname and basename are each given their own copy of the string. */
+ const char *gcc_path = dirname (ASTRDUP (collect_gcc));
+ const char *gcc_exec = basename (ASTRDUP (collect_gcc));
+
+ /* Room for "<gcc_path>/<GCC_INSTALL_NAME>" plus the terminating NUL. */
+ size_t len = (strlen (gcc_path) + 1 + strlen (GCC_INSTALL_NAME) + 1);
+ char *driver = XALLOCAVEC (char, len);
+
+ if (strcmp (gcc_exec, collect_gcc) == 0)
+ /* collect_gcc has no path, so it was found in PATH. Make sure we also
+ find accel-gcc in PATH. */
+ gcc_path = NULL;
+
+ int driver_used = 0;
+ if (gcc_path != NULL)
+ driver_used = sprintf (driver, "%s/", gcc_path);
+ sprintf (driver + driver_used, "%s", GCC_INSTALL_NAME);
+
+ bool found = false;
+ if (gcc_path == NULL)
+ found = true;
+ else if (access_check (driver, X_OK) == 0)
+ found = true;
+ else
+ {
+ /* Don't use alloca pointer with XRESIZEVEC. */
+ driver = NULL;
+ /* Look in all COMPILER_PATHs for GCC_INSTALL_NAME. */
+ char **paths = NULL;
+ unsigned n_paths;
+ /* NOTE(review): getenv returns NULL when COMPILER_PATH is unset and the
+ result is handed to parse_env_var unchecked -- confirm that the callee
+ copes with a NULL string. */
+ n_paths = parse_env_var (getenv ("COMPILER_PATH"), &paths);
+ for (unsigned i = 0; i < n_paths; i++)
+ {
+ len = strlen (paths[i]) + 1 + strlen (GCC_INSTALL_NAME) + 1;
+ driver = XRESIZEVEC (char, driver, len);
+ sprintf (driver, "%s/%s", paths[i], GCC_INSTALL_NAME);
+ if (access_check (driver, X_OK) == 0)
+ {
+ found = true;
+ break;
+ }
+ }
+ free_array_of_ptrs ((void **) paths, n_paths);
+ }
+
+ if (!found)
+ fatal_error (input_location,
+ "offload compiler %s not found", GCC_INSTALL_NAME);
+
+ /* We may be called with all the arguments stored in some file and
+ passed with @file. Expand them into argv before processing. */
+ expandargv (&argc, &argv);
+
+ /* Scan the argument vector. */
+ bool fopenmp = false;
+ bool fopenacc = false;
+ for (int i = 1; i < argc; i++)
+ {
+#define STR "-foffload-abi="
+ if (strncmp (argv[i], STR, strlen (STR)) == 0)
+ {
+ if (strcmp (argv[i] + strlen (STR), "lp64") == 0)
+ offload_abi = OFFLOAD_ABI_LP64;
+ else if (strcmp (argv[i] + strlen (STR), "ilp32") == 0)
+ offload_abi = OFFLOAD_ABI_ILP32;
+ else
+ fatal_error (input_location,
+ "unrecognizable argument of option " STR);
+ }
+#undef STR
+ else if (strcmp (argv[i], "-fopenmp") == 0)
+ fopenmp = true;
+ else if (strcmp (argv[i], "-fopenacc") == 0)
+ fopenacc = true;
+ else if (strcmp (argv[i], "-save-temps") == 0)
+ save_temps = true;
+ else if (strcmp (argv[i], "-v") == 0)
+ verbose = true;
+ }
+ /* Exactly one of the two must be given: both or neither is rejected. */
+ if (!(fopenacc ^ fopenmp))
+ fatal_error (input_location, "either -fopenacc or -fopenmp must be set");
+
+ /* An unset or unknown offload ABI is treated as unreachable here. */
+ const char *abi;
+ switch (offload_abi)
+ {
+ case OFFLOAD_ABI_LP64:
+ abi = "-m64";
+ break;
+ case OFFLOAD_ABI_ILP32:
+ abi = "-m32";
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* These names are what tool_cleanup later unlinks. */
+ gcn_s1_name = make_temp_file (".mkoffload.1.s");
+ gcn_s2_name = make_temp_file (".mkoffload.2.s");
+ gcn_o_name = make_temp_file (".mkoffload.hsaco");
+ gcn_cfile_name = make_temp_file (".c");
+
+ /* Build arguments for compiler pass. */
+ struct obstack cc_argv_obstack;
+ obstack_init (&cc_argv_obstack);
+ obstack_ptr_grow (&cc_argv_obstack, driver);
+ obstack_ptr_grow (&cc_argv_obstack, "-S");
+
+ if (save_temps)
+ obstack_ptr_grow (&cc_argv_obstack, "-save-temps");
+ if (verbose)
+ obstack_ptr_grow (&cc_argv_obstack, "-v");
+ obstack_ptr_grow (&cc_argv_obstack, abi);
+ obstack_ptr_grow (&cc_argv_obstack, "-xlto");
+ if (fopenmp)
+ obstack_ptr_grow (&cc_argv_obstack, "-mgomp");
+
+ /* Forward every argument except "-o <file>", whose operand becomes the
+ final output name. The last argument not starting with '-' is
+ remembered as the offload source name. */
+ for (int ix = 1; ix != argc; ix++)
+ {
+ if (!strcmp (argv[ix], "-o") && ix + 1 != argc)
+ outname = argv[++ix];
+ else
+ {
+ obstack_ptr_grow (&cc_argv_obstack, argv[ix]);
+
+ if (argv[ix][0] != '-')
+ offloadsrc = argv[ix];
+ }
+ }
+
+ obstack_ptr_grow (&cc_argv_obstack, "-o");
+ obstack_ptr_grow (&cc_argv_obstack, gcn_s1_name);
+ /* NOTE(review): offloadsrc is still NULL if no non-option argument was
+ seen; presumably concat then yields just the option prefix -- confirm. */
+ obstack_ptr_grow (&cc_argv_obstack,
+ concat ("-mlocal-symbol-id=", offloadsrc, NULL));
+ obstack_ptr_grow (&cc_argv_obstack, NULL);
+ const char **cc_argv = XOBFINISH (&cc_argv_obstack, const char **);
+
+ /* FIXME: remove this hack.
+ Allow an environment override hook for debug purposes. */
+ const char *override_gcn_s2_name = getenv ("OVERRIDE_GCN_INPUT_ASM");
+
+ /* Build arguments for assemble/link pass. */
+ struct obstack ld_argv_obstack;
+ obstack_init (&ld_argv_obstack);
+ obstack_ptr_grow (&ld_argv_obstack, driver);
+ obstack_ptr_grow (&ld_argv_obstack, (override_gcn_s2_name ? : gcn_s2_name));
+ obstack_ptr_grow (&ld_argv_obstack, "-lgomp");
+
+ /* Only library, linker pass-through and -march options are forwarded. */
+ for (int i = 1; i < argc; i++)
+ if (strncmp (argv[i], "-l", 2) == 0
+ || strncmp (argv[i], "-Wl", 3) == 0
+ || strncmp (argv[i], "-march", 6) == 0)
+ obstack_ptr_grow (&ld_argv_obstack, argv[i]);
+
+ obstack_ptr_grow (&ld_argv_obstack, "-o");
+ obstack_ptr_grow (&ld_argv_obstack, gcn_o_name);
+ obstack_ptr_grow (&ld_argv_obstack, NULL);
+ const char **ld_argv = XOBFINISH (&ld_argv_obstack, const char **);
+
+ /* Clean up unhelpful environment variables. */
+ /* The old values are saved first so they can be put back before the
+ host compile at the end of this function. */
+ char *execpath = getenv ("GCC_EXEC_PREFIX");
+ char *cpath = getenv ("COMPILER_PATH");
+ char *lpath = getenv ("LIBRARY_PATH");
+ unsetenv ("GCC_EXEC_PREFIX");
+ unsetenv ("COMPILER_PATH");
+ unsetenv ("LIBRARY_PATH");
+
+ /* Run the compiler pass. */
+ fork_execute (cc_argv[0], CONST_CAST (char **, cc_argv), true);
+ obstack_free (&cc_argv_obstack, NULL);
+
+ in = fopen (gcn_s1_name, "r");
+ if (!in)
+ fatal_error (input_location, "cannot open intermediate gcn asm file");
+
+ out = fopen (gcn_s2_name, "w");
+ if (!out)
+ fatal_error (input_location, "cannot open '%s'", gcn_s2_name);
+
+ cfile = fopen (gcn_cfile_name, "w");
+ if (!cfile)
+ fatal_error (input_location, "cannot open '%s'", gcn_cfile_name);
+
+ process_asm (in, out, cfile);
+
+ fclose (in);
+ fclose (out);
+
+ /* Run the assemble/link pass. */
+ fork_execute (ld_argv[0], CONST_CAST (char **, ld_argv), true);
+ obstack_free (&ld_argv_obstack, NULL);
+
+ in = fopen (gcn_o_name, "r");
+ if (!in)
+ fatal_error (input_location, "cannot open intermediate gcn obj file");
+
+ /* cfile is still open from above: the image is appended after the
+ tables that process_asm wrote. */
+ process_obj (in, cfile);
+
+ fclose (in);
+ fclose (cfile);
+
+ /* Put back the environment saved before the device compile.
+ NOTE(review): a variable that was originally unset has a NULL saved
+ value here, so it presumably ends up set to the empty string rather
+ than unset -- confirm this is intended. */
+ xputenv (concat ("GCC_EXEC_PREFIX=", execpath, NULL));
+ xputenv (concat ("COMPILER_PATH=", cpath, NULL));
+ xputenv (concat ("LIBRARY_PATH=", lpath, NULL));
+
+ /* NOTE(review): outname stays NULL when no "-o <file>" was given and is
+ passed on as the output file name -- verify callers always supply -o. */
+ compile_native (gcn_cfile_name, outname, collect_gcc);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,35 @@
+/* Support for AMD GCN offloading.
+
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_GCN_OFFLOAD_H
+#define GCC_GCN_OFFLOAD_H
+
+/* Support for OpenACC acc_on_device. */
+
+#include "gomp-constants.h"
+
+#define ACCEL_COMPILER_acc_device GOMP_DEVICE_GCN
+
+#endif
new file mode 100644
@@ -0,0 +1,189 @@
+;; Predicate definitions for GCN.
+;; Copyright (C) 2016-2017 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+;; Return true if VALUE can be stored in a sign extended immediate field.
+
+;; Accepts a register_operand that, after looking through one SUBREG, is
+;; either a pseudo register or one of the hard registers VCCZ_REG, SCC_REG
+;; or EXECZ_REG.
+(define_predicate "gcn_conditional_register_operand"
+ (match_operand 0 "register_operand")
+{
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op))
+ return 0;
+
+ return REGNO (op) == VCCZ_REG
+ || REGNO (op) == SCC_REG
+ || REGNO (op) == EXECZ_REG
+ || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+;; Accepts a register_operand that, after looking through one SUBREG, is
+;; either a pseudo register or a hard register satisfying SSRC_REGNO_P.
+(define_predicate "gcn_ssrc_register_operand"
+ (match_operand 0 "register_operand")
+{
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op))
+ return false;
+
+ return SSRC_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+;; Accepts a register_operand that, after looking through one SUBREG, is
+;; either a pseudo register or a hard register satisfying SDST_REGNO_P.
+(define_predicate "gcn_sdst_register_operand"
+ (match_operand 0 "register_operand")
+{
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op))
+ return false;
+
+ return SDST_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+;; Accepts a register_operand that, after looking through one SUBREG, is
+;; either a pseudo register or a hard register satisfying VGPR_REGNO_P.
+(define_predicate "gcn_vgpr_register_operand"
+ (match_operand 0 "register_operand")
+{
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op))
+ return false;
+
+ return VGPR_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+;; Accepts a const_int, const_double or const_vector for which
+;; gcn_inline_constant_p returns true.
+(define_predicate "gcn_inline_immediate_operand"
+ (match_code "const_int,const_double,const_vector")
+{
+ return gcn_inline_constant_p (op);
+})
+
+;; Accepts either an inline immediate (see gcn_inline_immediate_operand) or
+;; any register_operand.
+(define_predicate "gcn_vop3_operand"
+ (ior (match_operand 0 "gcn_inline_immediate_operand")
+ (match_operand 0 "register_operand")))
+
+;; Accepts a const_vector whose element 0 is const0_rtx and which
+;; gcn_inline_constant_p accepts.  Only element 0 is inspected here;
+;; NOTE(review): presumably gcn_inline_constant_p rejects non-uniform
+;; vectors -- confirm.
+(define_predicate "gcn_vec0_operand"
+ (match_code "const_vector")
+{
+ return CONST_VECTOR_ELT (op, 0) == const0_rtx && gcn_inline_constant_p (op);
+})
+
+;; As gcn_vec0_operand, but element 0 must be const1_rtx.
+(define_predicate "gcn_vec1_operand"
+ (match_code "const_vector")
+{
+ return CONST_VECTOR_ELT (op, 0) == const1_rtx && gcn_inline_constant_p (op);
+})
+
+;; Accepts a const_vector that gcn_inline_constant_p accepts and whose
+;; element 0 is a CONST_DOUBLE identical to dconst1 (floating-point 1.0).
+(define_predicate "gcn_vec1d_operand"
+ (match_code "const_vector")
+{
+ if (!gcn_inline_constant_p (op))
+ return false;
+
+ rtx elem = CONST_VECTOR_ELT (op, 0);
+ if (!CONST_DOUBLE_P (elem))
+ return false;
+ return real_identical (CONST_DOUBLE_REAL_VALUE (elem), &dconst1);
+})
+
+;; Scalar counterpart of gcn_vec1d_operand: a const_double that
+;; gcn_inline_constant_p accepts and that is identical to dconst1.
+(define_predicate "gcn_const1d_operand"
+ (match_code "const_double")
+{
+ return gcn_inline_constant_p (op)
+ && real_identical (CONST_DOUBLE_REAL_VALUE (op), &dconst1);
+})
+
+;; Accepts a const_int, const_double, const_vector, symbol_ref or label_ref
+;; for which gcn_constant_p returns true.
+(define_predicate "gcn_32bit_immediate_operand"
+ (match_code "const_int,const_double,const_vector,symbol_ref,label_ref")
+{
+ return gcn_constant_p (op);
+})
+
+; LRA works smoother when exec values are immediate constants
+; prior register allocation.
+;; Hence this predicate accepts any const_int as well as a register_operand.
+(define_predicate "gcn_exec_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_code "const_int")))
+
+;; Register-only variant: accepts exactly what register_operand accepts.
+(define_predicate "gcn_exec_reg_operand"
+ (match_operand 0 "register_operand"))
+
+;; Accepts a nonimmediate_operand or a constant accepted by
+;; gcn_32bit_immediate_operand.
+(define_predicate "gcn_load_operand"
+ (ior (match_operand 0 "nonimmediate_operand")
+ (match_operand 0 "gcn_32bit_immediate_operand")))
+
+;; Accepts a register_operand or a constant accepted by
+;; gcn_32bit_immediate_operand.
+(define_predicate "gcn_alu_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "gcn_32bit_immediate_operand")))
+
+;; Accepts a valid memory_operand whose address space satisfies AS_LDS_P or
+;; AS_GDS_P.
+(define_predicate "gcn_ds_memory_operand"
+ (and (match_code "mem")
+ (and (match_test "AS_LDS_P (MEM_ADDR_SPACE (op)) || AS_GDS_P (MEM_ADDR_SPACE (op))")
+ (match_operand 0 "memory_operand"))))
+
+;; Accepts a register_operand or a gcn_ds_memory_operand.
+(define_predicate "gcn_valu_dst_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "gcn_ds_memory_operand")))
+
+;; Accepts a register_operand, a gcn_32bit_immediate_operand or a
+;; gcn_ds_memory_operand.
+(define_predicate "gcn_valu_src0_operand"
+ (ior (match_operand 0 "register_operand")
+ (ior (match_operand 0 "gcn_32bit_immediate_operand")
+ (match_operand 0 "gcn_ds_memory_operand"))))
+
+;; Accepts exactly what register_operand accepts.
+(define_predicate "gcn_valu_src1_operand"
+ (match_operand 0 "register_operand"))
+
+;; Accepts a register_operand or a gcn_32bit_immediate_operand.
+;; NOTE(review): the "com" suffix presumably refers to commutative
+;; operations -- confirm against the patterns that use it.
+(define_predicate "gcn_valu_src1com_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "gcn_32bit_immediate_operand")))
+
+;; Accepts the rtx codes eq and ne only.
+(define_predicate "gcn_conditional_operator"
+ (match_code "eq,ne"))
+
+;; Accepts the rtx codes eq and ne only (same set as
+;; gcn_conditional_operator).
+(define_predicate "gcn_compare_64bit_operator"
+ (match_code "eq,ne"))
+
+;; Accepts the equality codes plus the signed and unsigned ordering
+;; comparisons.
+(define_predicate "gcn_compare_operator"
+ (match_code "eq,ne,gt,ge,lt,le,gtu,geu,ltu,leu"))
+
+;; As gcn_compare_operator, additionally accepting ordered and unordered.
+(define_predicate "gcn_fp_compare_operator"
+ (match_code "eq,ne,gt,ge,lt,le,gtu,geu,ltu,leu,ordered,unordered"))
+
+;; Accepts the rtx codes not and popcount.
+(define_predicate "unary_operator"
+ (match_code "not,popcount"))
+
+;; Accepts the bitwise logic, shift and signed/unsigned min/max rtx codes
+;; listed below.
+(define_predicate "binary_operator"
+ (match_code "and,ior,xor,ashift,lshiftrt,ashiftrt,smin,smax,umin,umax"))
+
+;; Accepts a register_operand, or an unspec whose number is UNSPEC_VECTOR.
+(define_predicate "gcn_register_or_unspec_operand"
+ (ior (match_operand 0 "register_operand")
+ (and (match_code "unspec")
+ (match_test "XINT (op, 1) == UNSPEC_VECTOR"))))
+
+;; Accepts a gcn_alu_operand, or an unspec whose number is UNSPEC_VECTOR.
+(define_predicate "gcn_alu_or_unspec_operand"
+ (ior (match_operand 0 "gcn_alu_operand")
+ (and (match_code "unspec")
+ (match_test "XINT (op, 1) == UNSPEC_VECTOR"))))
+
+;; Accepts a register_operand, a gcn_ds_memory_operand, or an unspec whose
+;; number is UNSPEC_VECTOR.
+(define_predicate "gcn_register_ds_or_unspec_operand"
+ (ior (match_operand 0 "register_operand")
+ (ior (match_operand 0 "gcn_ds_memory_operand")
+ (and (match_code "unspec")
+ (match_test "XINT (op, 1) == UNSPEC_VECTOR")))))
new file mode 100644
@@ -0,0 +1,51 @@
+# Copyright (C) 2016-2018 Free Software Foundation, Inc.
+#
+# This file is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This file is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+# <http://www.gnu.org/licenses/>.
+
+GTM_H += $(HASH_TABLE_H)
+
+driver-gcn.o: $(srcdir)/config/gcn/driver-gcn.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
+CFLAGS-mkoffload.o += $(DRIVER_DEFINES) \
+ -DGCC_INSTALL_NAME=\"$(GCC_INSTALL_NAME)\"
+mkoffload.o: $(srcdir)/config/gcn/mkoffload.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+ALL_HOST_OBJS += mkoffload.o
+
+mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a \
+ $(LIBIBERTY) $(LIBDEPS)
+ +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \
+ mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS)
+
+CFLAGS-gcn-run.o += -DVERSION_STRING=$(PKGVERSION_s)
+gcn-run.o: $(srcdir)/config/gcn/gcn-run.c
+ $(COMPILE) -x c -std=gnu11 $<
+ $(POSTCOMPILE)
+ALL_HOST_OBJS += gcn-run.o
+
+gcn-run$(exeext): gcn-run.o
+ +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ $< -ldl
+
+# Multilib configuration: a single option, march=gfx900, paired with the
+# directory name gcn5.
+MULTILIB_OPTIONS = march=gfx900
+MULTILIB_DIRNAMES = gcn5
+
+PASSES_EXTRA += $(srcdir)/config/gcn/gcn-passes.def
+gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+ALL_HOST_OBJS += gcn-tree.o