@@ -572,6 +572,7 @@ case $basic_machine in
| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
| am33_2.0 \
+ | amdgcn \
| arc | arceb \
| arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv6m | armv[78][arm] \
| avr | avr32 \
@@ -909,6 +910,9 @@ case $basic_machine in
fx2800)
basic_machine=i860-alliant
;;
+ amdgcn)
+ basic_machine=amdgcn-unknown
+ ;;
genix)
basic_machine=ns32k-ns
;;
@@ -1524,6 +1528,8 @@ case $os in
;;
*-eabi)
;;
+ amdhsa)
+ ;;
*)
echo Invalid configuration \`"$1"\': system \`"$os"\' not recognized 1>&2
exit 1
@@ -1548,6 +1554,9 @@ case $basic_machine in
spu-*)
os=elf
;;
+ amdgcn-*)
+ os=-amdhsa
+ ;;
*-acorn)
os=riscix1.2
;;
@@ -3569,6 +3569,8 @@ case "${target}" in
noconfigdirs="$noconfigdirs ld gas gdb gprof"
noconfigdirs="$noconfigdirs sim target-rda"
;;
+ amdgcn*-*-*)
+ ;;
arm-*-darwin*)
noconfigdirs="$noconfigdirs ld gas gdb gprof"
noconfigdirs="$noconfigdirs sim target-rda"
@@ -903,6 +903,8 @@ case "${target}" in
noconfigdirs="$noconfigdirs ld gas gdb gprof"
noconfigdirs="$noconfigdirs sim target-rda"
;;
+ amdgcn*-*-*)
+ ;;
arm-*-darwin*)
noconfigdirs="$noconfigdirs ld gas gdb gprof"
noconfigdirs="$noconfigdirs sim target-rda"
new file mode 100644
@@ -0,0 +1,38 @@
+/* Common hooks for GCN
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+
+ This file is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your option)
+ any later version.
+
+ This file is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "common/common-target.h"
+#include "common/common-target-def.h"
+#include "opts.h"
+#include "flags.h"
+#include "params.h"
+
+/* Set default optimization options. */
+static const struct default_options gcn_option_optimization_table[] =
+ {
+ { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
+ { OPT_LEVELS_NONE, 0, NULL, 0 }
+ };
+
+#undef TARGET_OPTION_OPTIMIZATION_TABLE
+#define TARGET_OPTION_OPTIMIZATION_TABLE gcn_option_optimization_table
+
+struct gcc_targetm_common targetm_common = TARGETM_COMMON_INITIALIZER;
@@ -312,6 +312,10 @@ alpha*-*-*)
cpu_type=alpha
extra_options="${extra_options} g.opt"
;;
+amdgcn*)
+ cpu_type=gcn
+ use_gcc_stdint=wrap
+ ;;
am33_2.0-*-linux*)
cpu_type=mn10300
;;
@@ -1376,6 +1380,19 @@ ft32-*-elf)
tm_file="dbxelf.h elfos.h newlib-stdint.h ${tm_file}"
tmake_file="${tmake_file} ft32/t-ft32"
;;
+amdgcn-*-amdhsa)
+ tm_file="dbxelf.h elfos.h gcn/gcn-hsa.h gcn/gcn.h newlib-stdint.h"
+ tmake_file="gcn/t-gcn-hsa"
+ native_system_header_dir=/include
+ extra_modes=gcn/gcn-modes.def
+ extra_objs="${extra_objs} gcn-tree.o"
+ extra_gcc_objs="driver-gcn.o"
+ extra_programs="${extra_programs} gcn-run\$(exeext)"
+ if test x$enable_as_accelerator = xyes; then
+ extra_programs="${extra_programs} mkoffload\$(exeext)"
+ tm_file="${tm_file} gcn/offload.h"
+ fi
+ ;;
moxie-*-elf)
gas=yes
gnu_ld=yes
@@ -4042,6 +4059,24 @@ case "${target}" in
esac
;;
+ amdgcn-*-*)
+ supported_defaults="arch tune"
+
+ for which in arch tune; do
+ eval "val=\$with_$which"
+ case ${val} in
+ "" | carrizo | fiji | gfx900 )
+ # OK
+ ;;
+ *)
+ echo "Unknown cpu used in --with-$which=$val." 1>&2
+ exit 1
+ ;;
+ esac
+ done
+ [ "x$with_arch" = x ] && with_arch=fiji
+ ;;
+
hppa*-*-*)
supported_defaults="arch schedule"
new file mode 100644
@@ -0,0 +1,139 @@
+;; Constraint definitions for GCN.
+;; Copyright (C) 2016-2017 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_constraint "I"
+ "Inline integer constant"
+ (and (match_code "const_int")
+ (match_test "ival >= -16 && ival <= 64")))
+
+(define_constraint "J"
+ "Signed integer 16-bit inline constant"
+ (and (match_code "const_int")
+ (match_test "((unsigned HOST_WIDE_INT) ival + 0x8000) < 0x10000")))
+
+(define_constraint "Kf"
+  "Immediate constant -1"
+ (and (match_code "const_int")
+ (match_test "ival == -1")))
+
+(define_constraint "L"
+ "Unsigned integer 15-bit constant"
+ (and (match_code "const_int")
+ (match_test "((unsigned HOST_WIDE_INT) ival) < 0x8000")))
+
+(define_constraint "A"
+ "Inline immediate parameter"
+ (and (match_code "const_int,const_double,const_vector")
+ (match_test "gcn_inline_constant_p (op)")))
+
+(define_constraint "B"
+ "Immediate 32-bit parameter"
+ (and (match_code "const_int,const_double,const_vector")
+ (match_test "gcn_constant_p (op)")))
+
+(define_constraint "C"
+ "Immediate 32-bit parameter zero-extended to 64-bits"
+ (and (match_code "const_int,const_double,const_vector")
+ (match_test "gcn_constant64_p (op)")))
+
+(define_constraint "DA"
+ "Splittable inline immediate 64-bit parameter"
+ (and (match_code "const_int,const_double,const_vector")
+ (match_test "gcn_inline_constant64_p (op)")))
+
+(define_constraint "DB"
+ "Splittable immediate 64-bit parameter"
+ (match_code "const_int,const_double,const_vector"))
+
+(define_constraint "U"
+ "unspecified value"
+ (match_code "unspec"))
+
+(define_constraint "Y"
+ "Symbol or label for relative calls"
+ (match_code "symbol_ref,label_ref"))
+
+(define_register_constraint "v" "VGPR_REGS"
+ "VGPR registers")
+
+(define_register_constraint "Sg" "SGPR_REGS"
+ "SGPR registers")
+
+(define_register_constraint "SD" "SGPR_DST_REGS"
+ "registers useable as a destination of scalar operation")
+
+(define_register_constraint "SS" "SGPR_SRC_REGS"
+ "registers useable as a source of scalar operation")
+
+(define_register_constraint "Sm" "SGPR_MEM_SRC_REGS"
+ "registers useable as a source of scalar memory operation")
+
+(define_register_constraint "Sv" "SGPR_VOP3A_SRC_REGS"
+ "registers useable as a source of VOP3A instruction")
+
+(define_register_constraint "ca" "ALL_CONDITIONAL_REGS"
+ "SCC VCCZ or EXECZ")
+
+(define_register_constraint "cs" "SCC_CONDITIONAL_REG"
+ "SCC")
+
+(define_register_constraint "cV" "VCC_CONDITIONAL_REG"
+ "VCC")
+
+(define_register_constraint "e" "EXEC_MASK_REG"
+ "EXEC")
+
+(define_special_memory_constraint "RB"
+ "Buffer memory address to scratch memory."
+ (and (match_code "mem")
+ (match_test "AS_SCRATCH_P (MEM_ADDR_SPACE (op))")))
+
+(define_special_memory_constraint "RF"
+ "Buffer memory address to flat memory."
+ (and (match_code "mem")
+ (match_test "AS_FLAT_P (MEM_ADDR_SPACE (op))
+ && gcn_flat_address_p (XEXP (op, 0), mode)")))
+
+(define_special_memory_constraint "RS"
+ "Buffer memory address to scalar flat memory."
+ (and (match_code "mem")
+ (match_test "AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op))
+ && gcn_scalar_flat_mem_p (op)")))
+
+(define_special_memory_constraint "RL"
+ "Buffer memory address to LDS memory."
+ (and (match_code "mem")
+ (match_test "AS_LDS_P (MEM_ADDR_SPACE (op))")))
+
+(define_special_memory_constraint "RG"
+ "Buffer memory address to GDS memory."
+ (and (match_code "mem")
+ (match_test "AS_GDS_P (MEM_ADDR_SPACE (op))")))
+
+(define_special_memory_constraint "RD"
+ "Buffer memory address to GDS or LDS memory."
+ (and (match_code "mem")
+ (ior (match_test "AS_GDS_P (MEM_ADDR_SPACE (op))")
+ (match_test "AS_LDS_P (MEM_ADDR_SPACE (op))"))))
+
+(define_special_memory_constraint "RM"
+ "Memory address to global (main) memory."
+ (and (match_code "mem")
+ (match_test "AS_GLOBAL_P (MEM_ADDR_SPACE (op))
+ && gcn_global_address_p (XEXP (op, 0))")))
new file mode 100644
@@ -0,0 +1,32 @@
+/* Subroutines for the gcc driver.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+
+const char *
+last_arg_spec_function (int argc, const char **argv)
+{
+ if (argc == 0)
+ return NULL;
+
+ return argv[argc-1];
+}
new file mode 100644
@@ -0,0 +1,116 @@
+/* Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+ This file is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your option)
+ any later version.
+
+ This file is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+/* The first argument to these macros is the return type of the builtin,
+ the rest are arguments of the builtin. */
+#define _A1(a) {a, GCN_BTI_END_OF_PARAMS}
+#define _A2(a,b) {a, b, GCN_BTI_END_OF_PARAMS}
+#define _A3(a,b,c) {a, b, c, GCN_BTI_END_OF_PARAMS}
+#define _A4(a,b,c,d) {a, b, c, d, GCN_BTI_END_OF_PARAMS}
+#define _A5(a,b,c,d,e) {a, b, c, d, e, GCN_BTI_END_OF_PARAMS}
+
+DEF_BUILTIN (FLAT_LOAD_INT32, 1 /*CODE_FOR_flat_load_v64si*/,
+ "flat_load_int32", B_INSN,
+ _A3 (GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (FLAT_LOAD_PTR_INT32, 2 /*CODE_FOR_flat_load_ptr_v64si */,
+ "flat_load_ptr_int32", B_INSN,
+ _A4 (GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_SIPTR, GCN_BTI_V64SI),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (FLAT_STORE_PTR_INT32, 3 /*CODE_FOR_flat_store_ptr_v64si */,
+ "flat_store_ptr_int32", B_INSN,
+ _A5 (GCN_BTI_VOID, GCN_BTI_EXEC, GCN_BTI_SIPTR, GCN_BTI_V64SI,
+ GCN_BTI_V64SI),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (FLAT_LOAD_PTR_FLOAT, 2 /*CODE_FOR_flat_load_ptr_v64sf */,
+ "flat_load_ptr_float", B_INSN,
+ _A4 (GCN_BTI_V64SF, GCN_BTI_EXEC, GCN_BTI_SFPTR, GCN_BTI_V64SI),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (FLAT_STORE_PTR_FLOAT, 3 /*CODE_FOR_flat_store_ptr_v64sf */,
+ "flat_store_ptr_float", B_INSN,
+ _A5 (GCN_BTI_VOID, GCN_BTI_EXEC, GCN_BTI_SFPTR, GCN_BTI_V64SI,
+ GCN_BTI_V64SF),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (SQRTVF, 3 /*CODE_FOR_sqrtvf */,
+ "sqrtvf", B_INSN,
+ _A2 (GCN_BTI_V64SF, GCN_BTI_V64SF),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (SQRTF, 3 /*CODE_FOR_sqrtf */,
+ "sqrtf", B_INSN,
+ _A2 (GCN_BTI_SF, GCN_BTI_SF),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (CMP_SWAP, -1,
+ "cmp_swap", B_INSN,
+ _A4 (GCN_BTI_UINT, GCN_BTI_VOIDPTR, GCN_BTI_UINT, GCN_BTI_UINT),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (CMP_SWAPLL, -1,
+ "cmp_swapll", B_INSN,
+ _A4 (GCN_BTI_LLUINT,
+ GCN_BTI_VOIDPTR, GCN_BTI_LLUINT, GCN_BTI_LLUINT),
+ gcn_expand_builtin_1)
+
+/* DEF_BUILTIN_BINOP_INT_FP creates many variants of a builtin function for a
+ given operation. The first argument will give base to the identifier of a
+   particular builtin, the second will be used to form the name of the pattern
+ used to expand it to and the third will be used to create the user-visible
+ builtin identifier. */
+
+DEF_BUILTIN_BINOP_INT_FP (ADD, add, "add")
+DEF_BUILTIN_BINOP_INT_FP (SUB, sub, "sub")
+
+DEF_BUILTIN_BINOP_INT_FP (AND, and, "and")
+DEF_BUILTIN_BINOP_INT_FP (IOR, ior, "or")
+DEF_BUILTIN_BINOP_INT_FP (XOR, xor, "xor")
+
+/* OpenMP. */
+
+DEF_BUILTIN (OMP_DIM_SIZE, CODE_FOR_oacc_dim_size,
+ "dim_size", B_INSN,
+ _A2 (GCN_BTI_INT, GCN_BTI_INT),
+ gcn_expand_builtin_1)
+DEF_BUILTIN (OMP_DIM_POS, CODE_FOR_oacc_dim_pos,
+ "dim_pos", B_INSN,
+ _A2 (GCN_BTI_INT, GCN_BTI_INT),
+ gcn_expand_builtin_1)
+
+/* OpenACC. */
+
+DEF_BUILTIN (ACC_SINGLE_START, -1, "single_start", B_INSN, _A1 (GCN_BTI_BOOL),
+ gcn_expand_builtin_1)
+
+DEF_BUILTIN (ACC_SINGLE_COPY_START, -1, "single_copy_start", B_INSN,
+ _A1 (GCN_BTI_LDS_VOIDPTR), gcn_expand_builtin_1)
+
+DEF_BUILTIN (ACC_SINGLE_COPY_END, -1, "single_copy_end", B_INSN,
+ _A2 (GCN_BTI_VOID, GCN_BTI_LDS_VOIDPTR), gcn_expand_builtin_1)
+
+DEF_BUILTIN (ACC_BARRIER, -1, "acc_barrier", B_INSN, _A1 (GCN_BTI_VOID),
+ gcn_expand_builtin_1)
+
+
+#undef _A1
+#undef _A2
+#undef _A3
+#undef _A4
+#undef _A5
new file mode 100644
@@ -0,0 +1,129 @@
+/* Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+ This file is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your option)
+ any later version.
+
+ This file is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef OBJECT_FORMAT_ELF
+ #error elf.h included before elfos.h
+#endif
+
+#define TEXT_SECTION_ASM_OP "\t.section\t.text"
+#define BSS_SECTION_ASM_OP "\t.section\t.bss"
+#define GLOBAL_ASM_OP "\t.globl\t"
+#define DATA_SECTION_ASM_OP "\t.data\t"
+#define SET_ASM_OP "\t.set\t"
+#define LOCAL_LABEL_PREFIX "."
+#define USER_LABEL_PREFIX ""
+#define ASM_COMMENT_START ";"
+#define TARGET_ASM_NAMED_SECTION default_elf_asm_named_section
+
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+ asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN)
+
+#undef ASM_DECLARE_FUNCTION_NAME
+#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL) \
+ gcn_hsa_declare_function_name ((FILE), (NAME), (DECL))
+
+#undef ASM_OUTPUT_ALIGNED_COMMON
+#define ASM_OUTPUT_ALIGNED_COMMON(FILE, NAME, SIZE, ALIGNMENT) \
+ (fprintf ((FILE), "%s", COMMON_ASM_OP), \
+ assemble_name ((FILE), (NAME)), \
+ fprintf ((FILE), "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n", \
+ (SIZE) > 0 ? (SIZE) : 1, (ALIGNMENT) / BITS_PER_UNIT))
+
+#define ASM_OUTPUT_LABEL(FILE,NAME) \
+ do { assemble_name (FILE, NAME); fputs (":\n", FILE); } while (0)
+
+#define ASM_OUTPUT_LABELREF(FILE, NAME) \
+ asm_fprintf (FILE, "%U%s", default_strip_name_encoding (NAME))
+
+extern unsigned int gcn_local_sym_hash (const char *name);
+
+/* The HSA runtime puts all global and local symbols into a single per-kernel
+ variable map. In cases where we have two local static symbols with the same
+ name in different compilation units, this causes multiple definition errors.
+ To avoid this, we add a decoration to local symbol names based on a hash of
+ a "module ID" passed to the compiler via the -mlocal-symbol-id option. This
+ is far from perfect, but we expect static local variables to be rare in
+ offload code. */
+
+#define ASM_FORMAT_PRIVATE_NAME(OUTVAR, NAME, NUMBER) \
+ do { \
+ (OUTVAR) = (char *) alloca (strlen (NAME) + 30); \
+ if (local_symbol_id && *local_symbol_id) \
+ sprintf ((OUTVAR), "%s.%u.%.8x", (NAME), (NUMBER), \
+ gcn_local_sym_hash (local_symbol_id)); \
+ else \
+ sprintf ((OUTVAR), "%s.%u", (NAME), (NUMBER)); \
+ } while (0)
+
+#define ASM_OUTPUT_SYMBOL_REF(FILE, X) gcn_asm_output_symbol_ref (FILE, X)
+
+#define ASM_OUTPUT_ADDR_DIFF_ELT(FILE, BODY, VALUE, REL) \
+ fprintf (FILE, "\t.word .L%d-.L%d\n", VALUE, REL)
+
+#define ASM_OUTPUT_ADDR_VEC_ELT(FILE, VALUE) \
+ fprintf (FILE, "\t.word .L%d\n", VALUE)
+
+#define ASM_OUTPUT_ALIGN(FILE,LOG) \
+ do { if (LOG!=0) fprintf (FILE, "\t.align\t%d\n", 1<<(LOG)); } while (0)
+#define ASM_OUTPUT_ALIGN_WITH_NOP(FILE,LOG) \
+ do { \
+ if (LOG!=0) \
+ fprintf (FILE, "\t.p2alignl\t%d, 0xBF800000" \
+ " ; Fill value is 's_nop 0'\n", (LOG)); \
+ } while (0)
+
+#define ASM_APP_ON ""
+#define ASM_APP_OFF ""
+
+/* Avoid the default in ../../gcc.c, which adds "-pthread", which is not
+ supported for gcn. */
+#define GOMP_SELF_SPECS ""
+
+/* Use LLVM assembler and linker options. */
+#define ASM_SPEC "-triple=amdgcn--amdhsa " \
+ "%:last_arg(%{march=*:-mcpu=%*}) " \
+ "-filetype=obj"
+/* Add -mlocal-symbol-id=<source-file-basename> unless the user (or mkoffload)
+ passes the option explicitly on the command line. The option also causes
+ several dump-matching tests to fail in the testsuite, so the option is not
+   added when tree dump or compare-debug options used in the testsuite are
+ present.
+ This has the potential for surprise, but a user can still use an explicit
+ -mlocal-symbol-id=<whatever> option manually together with -fdump-tree or
+ -fcompare-debug options. */
+#define CC1_SPEC "%{!mlocal-symbol-id=*:%{!fdump-tree-*:" \
+ "%{!fdump-ipa-*:%{!fcompare-debug*:-mlocal-symbol-id=%b}}}}"
+#define LINK_SPEC "--pie"
+#define LIB_SPEC "-lc"
+
+/* Provides a _start symbol to keep the linker happy. */
+#define STARTFILE_SPEC "crt0.o%s"
+#define ENDFILE_SPEC ""
+#define STANDARD_STARTFILE_PREFIX_2 ""
+
+/* The LLVM assembler rejects multiple -mcpu options, so we must drop
+ all but the last. */
+extern const char *last_arg_spec_function (int argc, const char **argv);
+#define EXTRA_SPEC_FUNCTIONS \
+ { "last_arg", last_arg_spec_function },
+
+#undef LOCAL_INCLUDE_DIR
+
+/* FIXME: review debug info settings */
+#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG
+#define DWARF2_DEBUGGING_INFO 1
+#define DWARF2_ASM_LINE_DEBUG_INFO 1
+#define EH_FRAME_THROUGH_COLLECT2 1
new file mode 100644
@@ -0,0 +1,45 @@
+/* Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+ This file is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your option)
+ any later version.
+
+ This file is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+/* Half-precision floating point */
+FLOAT_MODE (HF, 2, 0);
+/* FIXME: No idea what format it is. */
+ADJUST_FLOAT_FORMAT (HF, &ieee_half_format);
+
+/* Mask mode. Used for the autovectorizer only, and converted to DImode
+ during the expand pass. */
+VECTOR_BOOL_MODE (V64BI, 64, 8); /* V64BI */
+
+/* Native vector modes. */
+VECTOR_MODE (INT, QI, 64); /* V64QI */
+VECTOR_MODE (INT, HI, 64); /* V64HI */
+VECTOR_MODE (INT, SI, 64); /* V64SI */
+VECTOR_MODE (INT, DI, 64); /* V64DI */
+VECTOR_MODE (INT, TI, 64); /* V64TI */
+VECTOR_MODE (FLOAT, HF, 64); /* V64HF */
+VECTOR_MODE (FLOAT, SF, 64); /* V64SF */
+VECTOR_MODE (FLOAT, DF, 64); /* V64DF */
+
+/* Vector units handle reads independently and thus no large alignment
+ needed. */
+ADJUST_ALIGNMENT (V64QI, 1);
+ADJUST_ALIGNMENT (V64HI, 2);
+ADJUST_ALIGNMENT (V64SI, 4);
+ADJUST_ALIGNMENT (V64DI, 8);
+ADJUST_ALIGNMENT (V64TI, 16);
+ADJUST_ALIGNMENT (V64HF, 2);
+ADJUST_ALIGNMENT (V64SF, 4);
+ADJUST_ALIGNMENT (V64DF, 8);
new file mode 100644
@@ -0,0 +1,36 @@
+/* Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+ This file is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your option)
+ any later version.
+
+ This file is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCN_OPTS_H
+#define GCN_OPTS_H
+
+/* Which processor to generate code or schedule for. */
+enum processor_type
+{
+ PROCESSOR_CARRIZO,
+ PROCESSOR_FIJI,
+ PROCESSOR_VEGA
+};
+
+/* Set in gcn_option_override. */
+extern int gcn_isa;
+
+#define TARGET_GCN3 (gcn_isa == 3)
+#define TARGET_GCN3_PLUS (gcn_isa >= 3)
+#define TARGET_GCN5 (gcn_isa == 5)
+#define TARGET_GCN5_PLUS (gcn_isa >= 5)
+
+#endif
new file mode 100644
@@ -0,0 +1,19 @@
+/* Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3, or (at your option) any later
+ version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+INSERT_PASS_AFTER (pass_omp_target_link, 1, pass_omp_gcn);
new file mode 100644
@@ -0,0 +1,144 @@
+/* Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+ This file is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your option)
+ any later version.
+
+ This file is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _GCN_PROTOS_
+#define _GCN_PROTOS_
+
+extern void gcn_asm_output_symbol_ref (FILE *file, rtx x);
+extern tree gcn_builtin_decl (unsigned code, bool initialize_p);
+extern bool gcn_can_split_p (machine_mode, rtx);
+extern bool gcn_constant64_p (rtx);
+extern bool gcn_constant_p (rtx);
+extern rtx gcn_convert_mask_mode (rtx reg);
+extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int);
+extern void gcn_expand_epilogue ();
+extern void gcn_expand_prologue ();
+extern rtx gcn_expand_reduc_scalar (machine_mode, rtx, int);
+extern rtx gcn_expand_scalar_to_vector_address (machine_mode, rtx, rtx, rtx);
+extern void gcn_expand_vector_init (rtx, rtx);
+extern bool gcn_flat_address_p (rtx, machine_mode);
+extern bool gcn_fp_constant_p (rtx, bool);
+extern rtx gcn_full_exec ();
+extern rtx gcn_full_exec_reg ();
+extern rtx gcn_gen_undef (machine_mode);
+extern bool gcn_global_address_p (rtx);
+extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
+ const char *name);
+extern void gcn_goacc_adjust_gangprivate_decl (tree var);
+extern void gcn_goacc_reduction (gcall *call);
+extern bool gcn_hard_regno_rename_ok (unsigned int from_reg,
+ unsigned int to_reg);
+extern machine_mode gcn_hard_regno_caller_save_mode (unsigned int regno,
+ unsigned int nregs,
+ machine_mode regmode);
+extern bool gcn_hard_regno_mode_ok (int regno, machine_mode mode);
+extern int gcn_hard_regno_nregs (int regno, machine_mode mode);
+extern void gcn_hsa_declare_function_name (FILE *file, const char *name,
+ tree decl);
+extern HOST_WIDE_INT gcn_initial_elimination_offset (int, int);
+extern bool gcn_inline_constant64_p (rtx);
+extern bool gcn_inline_constant_p (rtx);
+extern int gcn_inline_fp_constant_p (rtx, bool);
+extern reg_class gcn_mode_code_base_reg_class (machine_mode, addr_space_t,
+ int, int);
+extern rtx gcn_oacc_dim_pos (int dim);
+extern rtx gcn_oacc_dim_size (int dim);
+extern rtx gcn_operand_doublepart (machine_mode, rtx, int);
+extern rtx gcn_operand_part (machine_mode, rtx, int);
+extern bool gcn_regno_mode_code_ok_for_base_p (int, machine_mode,
+ addr_space_t, int, int);
+extern reg_class gcn_regno_reg_class (int regno);
+extern rtx gcn_scalar_exec ();
+extern rtx gcn_scalar_exec_reg ();
+extern bool gcn_scalar_flat_address_p (rtx);
+extern bool gcn_scalar_flat_mem_p (rtx);
+extern bool gcn_sgpr_move_p (rtx, rtx);
+extern bool gcn_valid_move_p (machine_mode, rtx, rtx);
+extern rtx gcn_vec_constant (machine_mode, int);
+extern rtx gcn_vec_constant (machine_mode, rtx);
+extern bool gcn_vgpr_move_p (rtx, rtx);
+extern void print_operand_address (FILE *file, register rtx addr);
+extern void print_operand (FILE *file, rtx x, int code);
+extern bool regno_ok_for_index_p (int);
+
+enum gcn_cvt_t
+{
+ fix_trunc_cvt,
+ fixuns_trunc_cvt,
+ float_cvt,
+ floatuns_cvt,
+ extend_cvt,
+ trunc_cvt
+};
+
+extern bool gcn_valid_cvt_p (machine_mode from, machine_mode to,
+ enum gcn_cvt_t op);
+
+#ifdef TREE_CODE
+extern void gcn_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree,
+ int);
+class gimple_opt_pass;
+extern gimple_opt_pass *make_pass_omp_gcn (gcc::context *ctxt);
+#endif
+
+/* Return true if MODE is valid for 1 VGPR register. */
+
+inline bool
+vgpr_1reg_mode_p (machine_mode mode)
+{
+ return (mode == SImode || mode == SFmode || mode == HImode || mode == QImode
+ || mode == V64QImode || mode == V64HImode || mode == V64SImode
+ || mode == V64HFmode || mode == V64SFmode || mode == BImode);
+}
+
+/* Return true if MODE is valid for 1 SGPR register. */
+
+inline bool
+sgpr_1reg_mode_p (machine_mode mode)
+{
+ return (mode == SImode || mode == SFmode || mode == HImode
+ || mode == QImode || mode == BImode);
+}
+
+/* Return true if MODE is valid for pair of VGPR registers. */
+
+inline bool
+vgpr_2reg_mode_p (machine_mode mode)
+{
+ return (mode == DImode || mode == DFmode
+ || mode == V64DImode || mode == V64DFmode);
+}
+
+/* Return true if MODE can be handled directly by VGPR operations. */
+
+inline bool
+vgpr_vector_mode_p (machine_mode mode)
+{
+ return (mode == V64QImode || mode == V64HImode
+ || mode == V64SImode || mode == V64DImode
+ || mode == V64HFmode || mode == V64SFmode || mode == V64DFmode);
+}
+
+
+/* Return true if MODE is valid for pair of SGPR registers. */
+
+inline bool
+sgpr_2reg_mode_p (machine_mode mode)
+{
+ return mode == DImode || mode == DFmode || mode == V64BImode;
+}
+
+#endif
new file mode 100644
@@ -0,0 +1,854 @@
+/* Run a stand-alone AMD GCN kernel.
+
+ Copyright 2017 Mentor Graphics Corporation
+ Copyright 2018 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* This program will run a compiled stand-alone GCN kernel on a GPU.
+
+ The kernel entry point's signature must use a standard main signature:
+
+ int main(int argc, char **argv)
+*/
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <unistd.h>
+#include <elf.h>
+#include <signal.h>
+
+/* These probably won't be in elf.h for a while.  */
+#ifndef R_AMDGPU_NONE
+#define R_AMDGPU_NONE 0
+#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
+#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
+#define R_AMDGPU_ABS64 3 /* S + A */
+#define R_AMDGPU_REL32 4 /* S + A - P */
+#define R_AMDGPU_REL64 5 /* S + A - P */
+#define R_AMDGPU_ABS32 6 /* S + A */
+#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
+#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
+#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
+#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
+#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
+/* NOTE(review): relocation number 12 is a gap in the AMDGPU numbering,
+   but defining a lowercase macro named "reserved" pollutes the global
+   namespace -- consider renaming (it is never used below).  */
+#define reserved 12
+#define R_AMDGPU_RELATIVE64 13 /* B + A */
+#endif
+
+#include "hsa.h"
+
+#ifndef HSA_RUNTIME_LIB
+#define HSA_RUNTIME_LIB "libhsa-runtime64.so"
+#endif
+
+#ifndef VERSION_STRING
+#define VERSION_STRING "(version unknown)"
+#endif
+
+/* Verbose progress output, enabled by --debug.  */
+bool debug = false;
+
+/* GPU agent and dispatch queue selected/created by init_device.  */
+hsa_agent_t device = { 0 };
+hsa_queue_t *queue = NULL;
+/* Kernel code handle and executable, filled in by load_image.  */
+uint64_t kernel = 0;
+hsa_executable_t executable = { 0 };
+
+/* Kernarg memory region (host- and device-visible), and the kernel's
+   segment size requirements extracted from its symbol info.  */
+hsa_region_t kernargs_region = { 0 };
+uint32_t kernarg_segment_size = 0;
+uint32_t group_segment_size = 0;
+uint32_t private_segment_size = 0;
+
+static void
+usage (const char *progname)
+{
+  /* Print the command-line synopsis for PROGNAME.  */
+  static const char *const options_text =
+    "Options:\n"
+    "  --help\n"
+    "  --version\n"
+    "  --debug\n";
+  printf ("Usage: %s [options] kernel [kernel-args]\n\n%s",
+	  progname, options_text);
+}
+
+static void
+version (const char *progname)
+{
+  /* Report PROGNAME followed by the build-time version string.  */
+  fputs (progname, stdout);
+  fputs (" " VERSION_STRING "\n", stdout);
+}
+
+/* As an HSA runtime is dlopened, following structure defines the necessary
+ function pointers.
+ Code adapted from libgomp. */
+
+struct hsa_runtime_fn_info
+{
+  /* HSA runtime.  */
+  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
+					const char **status_string);
+  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
+					 hsa_agent_info_t attribute,
+					 void *value);
+  hsa_status_t (*hsa_init_fn) (void);
+  hsa_status_t (*hsa_iterate_agents_fn)
+    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
+  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
+					  hsa_region_info_t attribute,
+					  void *value);
+  hsa_status_t (*hsa_queue_create_fn)
+    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
+     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
+     void *data, uint32_t private_segment_size,
+     uint32_t group_segment_size, hsa_queue_t **queue);
+  hsa_status_t (*hsa_agent_iterate_regions_fn)
+    (hsa_agent_t agent,
+     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
+  /* Executable loading/symbol lookup.  */
+  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
+  hsa_status_t (*hsa_executable_create_fn)
+    (hsa_profile_t profile, hsa_executable_state_t executable_state,
+     const char *options, hsa_executable_t *executable);
+  hsa_status_t (*hsa_executable_global_variable_define_fn)
+    (hsa_executable_t executable, const char *variable_name, void *address);
+  hsa_status_t (*hsa_executable_load_code_object_fn)
+    (hsa_executable_t executable, hsa_agent_t agent,
+     hsa_code_object_t code_object, const char *options);
+  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
+					    const char *options);
+  /* Signals and memory.  */
+  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
+					uint32_t num_consumers,
+					const hsa_agent_t *consumers,
+					hsa_signal_t *signal);
+  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
+					  void **ptr);
+  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
+				      size_t size);
+  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
+  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
+  hsa_status_t (*hsa_executable_get_symbol_fn)
+    (hsa_executable_t executable, const char *module_name,
+     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
+     hsa_executable_symbol_t *symbol);
+  hsa_status_t (*hsa_executable_symbol_get_info_fn)
+    (hsa_executable_symbol_t executable_symbol,
+     hsa_executable_symbol_info_t attribute, void *value);
+  /* Kernel dispatch and completion waiting.  */
+  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
+				       hsa_signal_value_t value);
+  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
+    (hsa_signal_t signal, hsa_signal_condition_t condition,
+     hsa_signal_value_t compare_value, uint64_t timeout_hint,
+     hsa_wait_state_t wait_state_hint);
+  hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
+    (hsa_signal_t signal, hsa_signal_condition_t condition,
+     hsa_signal_value_t compare_value, uint64_t timeout_hint,
+     hsa_wait_state_t wait_state_hint);
+  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
+  hsa_status_t (*hsa_code_object_deserialize_fn)
+    (void *serialized_code_object, size_t serialized_code_object_size,
+     const char *options, hsa_code_object_t *code_object);
+  uint64_t (*hsa_queue_load_write_index_relaxed_fn)
+    (const hsa_queue_t *queue);
+  void (*hsa_queue_store_write_index_relaxed_fn)
+    (const hsa_queue_t *queue, uint64_t value);
+  hsa_status_t (*hsa_shut_down_fn) ();
+};
+
+/* HSA runtime functions that are initialized in init_hsa_context.
+ Code adapted from libgomp. */
+
+static struct hsa_runtime_fn_info hsa_fns;
+
+/* Resolve one HSA API symbol from `handle' into the matching hsa_fns
+   slot; jump to the shared failure path if the symbol is missing.  */
+#define DLSYM_FN(function) \
+  hsa_fns.function##_fn = dlsym (handle, #function); \
+  if (hsa_fns.function##_fn == NULL) \
+    goto fail;
+
+/* dlopen the HSA runtime library and resolve every entry point used by
+   this program into hsa_fns.  Exits on any failure.  */
+static void
+init_hsa_runtime_functions (void)
+{
+  void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
+  if (handle == NULL)
+    {
+      fprintf (stderr,
+	       "The HSA runtime is required to run GCN kernels on hardware.\n"
+	       "%s: File not found or could not be opened\n",
+	       HSA_RUNTIME_LIB);
+      exit (1);
+    }
+
+  DLSYM_FN (hsa_status_string)
+  DLSYM_FN (hsa_agent_get_info)
+  DLSYM_FN (hsa_init)
+  DLSYM_FN (hsa_iterate_agents)
+  DLSYM_FN (hsa_region_get_info)
+  DLSYM_FN (hsa_queue_create)
+  DLSYM_FN (hsa_agent_iterate_regions)
+  DLSYM_FN (hsa_executable_destroy)
+  DLSYM_FN (hsa_executable_create)
+  DLSYM_FN (hsa_executable_global_variable_define)
+  DLSYM_FN (hsa_executable_load_code_object)
+  DLSYM_FN (hsa_executable_freeze)
+  DLSYM_FN (hsa_signal_create)
+  DLSYM_FN (hsa_memory_allocate)
+  DLSYM_FN (hsa_memory_copy)
+  DLSYM_FN (hsa_memory_free)
+  DLSYM_FN (hsa_signal_destroy)
+  DLSYM_FN (hsa_executable_get_symbol)
+  DLSYM_FN (hsa_executable_symbol_get_info)
+  DLSYM_FN (hsa_signal_wait_acquire)
+  DLSYM_FN (hsa_signal_wait_relaxed)
+  DLSYM_FN (hsa_signal_store_relaxed)
+  DLSYM_FN (hsa_queue_destroy)
+  DLSYM_FN (hsa_code_object_deserialize)
+  DLSYM_FN (hsa_queue_load_write_index_relaxed)
+  DLSYM_FN (hsa_queue_store_write_index_relaxed)
+  DLSYM_FN (hsa_shut_down)
+
+  return;
+
+fail:
+  fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
+  exit (1);
+}
+
+#undef DLSYM_FN
+
+/* Report a fatal error STR together with the HSA error corresponding to
+ STATUS and terminate execution of the current process. */
+
+static void
+hsa_fatal (const char *str, hsa_status_t status)
+{
+  /* Ask the runtime for the human-readable text of STATUS, then report
+     STR alongside it and terminate the process.  */
+  const char *message;
+  hsa_fns.hsa_status_string_fn (status, &message);
+  fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str, message);
+  exit (1);
+}
+
+/* Helper macros to ensure we check the return values from the HSA Runtime.
+ These just keep the rest of the code a bit cleaner. */
+
+/* Evaluate the HSA call FN once, then test predicate CMP (which may
+   reference the local `status'); on failure report MSG and exit, and in
+   debug mode log MSG on success.  */
+#define XHSA_CMP(FN, CMP, MSG) \
+  do { \
+    hsa_status_t status = (FN); \
+    if (!(CMP)) \
+      hsa_fatal ((MSG), status); \
+    else if (debug) \
+      fprintf (stderr, "%s: OK\n", (MSG)); \
+  } while (0)
+/* Common case: the call must return HSA_STATUS_SUCCESS exactly.  */
+#define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
+
+/* Callback of hsa_iterate_agents.
+ Called once for each available device, and returns "break" when a
+ suitable one has been found. */
+
+static hsa_status_t
+get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
+{
+  hsa_device_type_t type;
+  XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
+				       &type),
+	"Get agent type");
+
+  /* Keep iterating until a GPU device is seen.  */
+  /* TODO: support selecting from multiple GPUs.  */
+  if (type != HSA_DEVICE_TYPE_GPU)
+    return HSA_STATUS_SUCCESS;
+
+  /* Remember the first GPU found and stop the iteration.  */
+  device = agent;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+/* Callback of hsa_iterate_regions.
+ Called once for each available memory region, and returns "break" when a
+ suitable one has been found. */
+
+static hsa_status_t
+get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused)))
+{
+  /* Only global-segment regions are candidates.  */
+  hsa_region_segment_t seg;
+  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &seg);
+  if (seg != HSA_REGION_SEGMENT_GLOBAL)
+    return HSA_STATUS_SUCCESS;
+
+  /* Of those, require the KERNARG flag.  */
+  hsa_region_global_flag_t flags;
+  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
+				  &flags);
+  if (!(flags & HSA_REGION_GLOBAL_FLAG_KERNARG))
+    return HSA_STATUS_SUCCESS;
+
+  /* Found a usable region; stop the iteration.  */
+  kernargs_region = region;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+/* Initialize the HSA Runtime library and GPU device. */
+
+static void
+init_device ()
+{
+  /* Load the shared library and find the API functions.  */
+  init_hsa_runtime_functions ();
+
+  /* Initialize the HSA Runtime.  */
+  XHSA (hsa_fns.hsa_init_fn (),
+	"Initialize run-time");
+
+  /* Select a suitable device.
+     The call-back function, get_gpu_agent, does the selection.
+     HSA_STATUS_INFO_BREAK here means the callback found a device.  */
+  XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
+	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
+	    "Find a device");
+
+  /* Initialize the queue used for launching kernels.
+     The queue is sized to the device maximum.  */
+  uint32_t queue_size = 0;
+  XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
+				       &queue_size),
+	"Find max queue size");
+  XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
+				     HSA_QUEUE_TYPE_SINGLE, NULL,
+				     NULL, UINT32_MAX, UINT32_MAX, &queue),
+	"Set up a device queue");
+
+  /* Select a memory region for the kernel arguments.
+     The call-back function, get_kernarg_region, does the selection.  */
+  XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
+						  NULL),
+	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
+	    "Locate kernargs memory");
+}
+
+
+/* Read a whole input file.
+ Code copied from mkoffload. */
+
+static char *
+read_file (const char *filename, size_t *plen)
+{
+  /* Read FILENAME into a freshly-allocated NUL-terminated buffer,
+     storing its length in *PLEN.  Exits on any failure.
+     The fallback/growth path handles unseekable streams.  */
+  size_t alloc = 16384;
+  size_t base = 0;
+  char *buffer;
+
+  FILE *stream = fopen (filename, "rb");
+  if (!stream)
+    {
+      perror (filename);
+      exit (1);
+    }
+
+  if (!fseek (stream, 0, SEEK_END))
+    {
+      /* Get the file size.  */
+      long s = ftell (stream);
+      if (s >= 0)
+	alloc = s + 100;
+      fseek (stream, 0, SEEK_SET);
+    }
+  buffer = malloc (alloc);
+  if (!buffer)
+    {
+      fprintf (stderr, "%s: out of memory\n", filename);
+      exit (1);
+    }
+
+  for (;;)
+    {
+      /* Leave one byte spare for the terminating NUL.  */
+      size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
+
+      if (!n)
+	break;
+      base += n;
+      if (base + 1 == alloc)
+	{
+	  alloc *= 2;
+	  /* Use a temporary so the old buffer is not lost if realloc
+	     fails (it is still freed implicitly by exit, but keeps the
+	     pattern safe).  */
+	  char *new_buffer = realloc (buffer, alloc);
+	  if (!new_buffer)
+	    {
+	      fprintf (stderr, "%s: out of memory\n", filename);
+	      exit (1);
+	    }
+	  buffer = new_buffer;
+	}
+    }
+  buffer[base] = 0;
+  *plen = base;
+
+  fclose (stream);
+
+  return buffer;
+}
+
+/* Read a HSA Code Object (HSACO) from file, and load it into the device. */
+
+static void
+load_image (const char *filename)
+{
+  /* Read the HSACO (an ELF64 shared object) from FILENAME, load it into
+     the device via the HSA loader, locate the "main" kernel, and apply
+     the relocations the loader was prevented from seeing.  */
+  size_t image_size;
+  Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
+
+  /* An "executable" consists of one or more code objects.  */
+  XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
+					  HSA_EXECUTABLE_STATE_UNFROZEN, "",
+					  &executable),
+	"Initialize GCN executable");
+
+  /* Hide relocations from the HSA runtime loader.
+     Keep a copy of the unmodified section headers to use later.  */
+  Elf64_Shdr *image_sections =
+    (Elf64_Shdr *) ((char *) image + image->e_shoff);
+  Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
+  memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
+  for (int i = image->e_shnum - 1; i >= 0; i--)
+    {
+      if (image_sections[i].sh_type == SHT_RELA
+	  || image_sections[i].sh_type == SHT_REL)
+	/* Change section type to something harmless.  */
+	image_sections[i].sh_type = SHT_NOTE;
+    }
+
+  /* Add the HSACO to the executable.  */
+  hsa_code_object_t co = { 0 };
+  XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
+	"Deserialize GCN code object");
+  XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
+						    ""),
+	"Load GCN code object");
+
+  /* We're done modifying the executable.  */
+  XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
+	"Freeze GCN executable");
+
+  /* Locate the "main" function, and read the kernel's properties.  */
+  hsa_executable_symbol_t symbol;
+  XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
+					      device, 0, &symbol),
+	"Find 'main' function");
+  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
+	  (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel),
+	"Extract kernel object");
+  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
+	  (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+	   &kernarg_segment_size),
+	"Extract kernarg segment size");
+  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
+	  (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
+	   &group_segment_size),
+	"Extract group segment size");
+  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
+	  (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+	   &private_segment_size),
+	"Extract private segment size");
+
+  /* Find main function in ELF, and calculate actual load offset:
+     loaded address minus the symbol's link-time st_value.  */
+  Elf64_Addr load_offset;
+  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
+	  (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
+	   &load_offset),
+	"Extract 'main' symbol address");
+  for (int i = 0; i < image->e_shnum; i++)
+    if (sections[i].sh_type == SHT_SYMTAB)
+      {
+	/* Was garbled "&sect;ions" -- take the address of the linked
+	   string table's section header.  */
+	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
+	char *strings = (char *) image + strtab->sh_offset;
+
+	for (size_t offset = 0;
+	     offset < sections[i].sh_size;
+	     offset += sections[i].sh_entsize)
+	  {
+	    Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
+					    + sections[i].sh_offset + offset);
+	    if (strcmp ("main", strings + sym->st_name) == 0)
+	      {
+		load_offset -= sym->st_value;
+		goto found_main;
+	      }
+	  }
+      }
+  /* We only get here when main was not found.
+     This should never happen.  */
+  fprintf (stderr, "Error: main function not found.\n");
+  abort ();
+found_main:;
+
+  /* Find dynamic symbol table.  */
+  Elf64_Shdr *dynsym = NULL;
+  for (int i = 0; i < image->e_shnum; i++)
+    if (sections[i].sh_type == SHT_DYNSYM)
+      {
+	/* Was garbled "&sect;ions" -- take the section's address.  */
+	dynsym = &sections[i];
+	break;
+      }
+
+  /* Fix up relocations.  */
+  for (int i = 0; i < image->e_shnum; i++)
+    {
+      if (sections[i].sh_type == SHT_RELA)
+	for (size_t offset = 0;
+	     offset < sections[i].sh_size;
+	     offset += sections[i].sh_entsize)
+	  {
+	    Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
+						+ sections[i].sh_offset
+						+ offset);
+	    Elf64_Sym *sym =
+	      (dynsym
+	       ? (Elf64_Sym *) ((char *) image
+				+ dynsym->sh_offset
+				+ (dynsym->sh_entsize
+				   * ELF64_R_SYM (reloc->r_info))) : NULL);
+
+	    /* Standard ELF relocation inputs: Symbol value, Place,
+	       Addend, Base (load offset).  */
+	    int64_t S = (sym ? sym->st_value : 0);
+	    int64_t P = reloc->r_offset + load_offset;
+	    int64_t A = reloc->r_addend;
+	    int64_t B = load_offset;
+	    int64_t V, size;
+	    switch (ELF64_R_TYPE (reloc->r_info))
+	      {
+	      case R_AMDGPU_ABS32_LO:
+		V = (S + A) & 0xFFFFFFFF;
+		size = 4;
+		break;
+	      case R_AMDGPU_ABS32_HI:
+		V = (S + A) >> 32;
+		size = 4;
+		break;
+	      case R_AMDGPU_ABS64:
+		V = S + A;
+		size = 8;
+		break;
+	      case R_AMDGPU_REL32:
+		V = S + A - P;
+		size = 4;
+		break;
+	      case R_AMDGPU_REL64:
+		/* FIXME
+		   LLD seems to emit REL64 where the assembler has ABS64.
+		   This is clearly wrong because it's not what the compiler
+		   is expecting.  Let's assume, for now, that it's a bug.
+		   In any case, GCN kernels are always self contained and
+		   therefore relative relocations will have been resolved
+		   already, so this should be a safe workaround.  */
+		V = S + A /* - P */ ;
+		size = 8;
+		break;
+	      case R_AMDGPU_ABS32:
+		V = S + A;
+		size = 4;
+		break;
+	      /* TODO R_AMDGPU_GOTPCREL */
+	      /* TODO R_AMDGPU_GOTPCREL32_LO */
+	      /* TODO R_AMDGPU_GOTPCREL32_HI */
+	      case R_AMDGPU_REL32_LO:
+		V = (S + A - P) & 0xFFFFFFFF;
+		size = 4;
+		break;
+	      case R_AMDGPU_REL32_HI:
+		V = (S + A - P) >> 32;
+		size = 4;
+		break;
+	      case R_AMDGPU_RELATIVE64:
+		V = B + A;
+		size = 8;
+		break;
+	      default:
+		fprintf (stderr, "Error: unsupported relocation type.\n");
+		exit (1);
+	      }
+	    /* The loaded image is host-accessible (full profile), so the
+	       fix-up is a plain memory copy to the loaded address.  */
+	    XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
+		  "Fix up relocation");
+	  }
+    }
+
+  /* The section-header copy is no longer needed.  */
+  free (sections);
+}
+
+/* Allocate some device memory from the kernargs region.
+ The returned address will be 32-bit (with excess zeroed on 64-bit host),
+ and accessible via the same address on both host and target (via
+ __flat_scalar GCN address space). */
+
+static void *
+device_malloc (size_t size)
+{
+  /* All allocations come from the kernargs region, which is visible to
+     both the host and the device.  Exits on failure (via XHSA).  */
+  void *ptr = NULL;
+  XHSA (hsa_fns.hsa_memory_allocate_fn (kernargs_region, size, &ptr),
+	"Allocate device memory");
+  return ptr;
+}
+
+/* These are the device pointers that will be transferred to the target.
+ The HSA Runtime points the kernargs register here.
+ They correspond to function signature:
+ int main (int argc, char *argv[], int *return_value)
+ The compiler expects this, for kernel functions, and will
+ automatically assign the exit value to *return_value. */
+struct kernargs
+{
+  /* Kernargs.  */
+  int32_t argc;
+  int64_t argv;		/* Device-visible address of the argv array.  */
+  int64_t out_ptr;	/* Device-visible address of output_data below.  */
+  int64_t heap_ptr;	/* Device-visible address of heap below.  */
+
+  /* Output data.  */
+  struct output
+  {
+    int return_value;
+    int next_output;	/* Count of entries the kernel has queued.  */
+    /* One entry per device-side print call; `written' is the host's cue
+       that the entry is complete.  */
+    struct printf_data
+    {
+      int written;
+      char msg[128];
+      int type;		/* 0: int, 1: double, 2: string+NL, 3: string.  */
+      union
+      {
+	int64_t ivalue;
+	double dvalue;
+	char text[128];
+      };
+    } queue[1000];
+  } output_data;
+
+  /* Flexible heap area handed to the device allocator; `data' is a
+     zero-length trailing array, sized at allocation time.  */
+  struct heap
+  {
+    int64_t size;
+    char data[0];
+  } heap;
+};
+
+/* Print any console output from the kernel.
+ We print all entries from print_index to the next entry without a "written"
+ flag. Subsequent calls should use the returned print_index value to resume
+ from the same point. */
+void
+gomp_print_output (struct kernargs *kernargs, int *print_index)
+{
+  /* Drain completed entries from the device's print queue, starting at
+     *PRINT_INDEX and stopping at the first entry not yet marked
+     written.  Updates *PRINT_INDEX for the next call.  */
+  static bool warned_p = false;
+
+  int limit = (sizeof (kernargs->output_data.queue)
+	       / sizeof (kernargs->output_data.queue[0]));
+
+  int i;
+  for (i = *print_index; i < limit; i++)
+    {
+      struct printf_data *data = &kernargs->output_data.queue[i];
+
+      if (!data->written)
+	break;
+
+      switch (data->type)
+	{
+	case 0:
+	  /* ivalue is int64_t; cast so the format is correct even where
+	     `long' is 32-bit (previously used %ld).  */
+	  printf ("%.128s%lld\n", data->msg, (long long) data->ivalue);
+	  break;
+	case 1:
+	  printf ("%.128s%f\n", data->msg, data->dvalue);
+	  break;
+	case 2:
+	  printf ("%.128s%.128s\n", data->msg, data->text);
+	  break;
+	case 3:
+	  printf ("%.128s%.128s", data->msg, data->text);
+	  break;
+	}
+
+      /* Mark the slot consumed.  */
+      data->written = 0;
+    }
+
+  /* Warn (once) if the device queued more entries than fit.  */
+  if (kernargs->output_data.next_output > limit && !warned_p)
+    {
+      printf ("WARNING: GCN print buffer exhausted.\n");
+      warned_p = true;
+    }
+
+  *print_index = i;
+}
+
+/* Execute an already-loaded kernel on the device. */
+
+static void
+run (void *kernargs)
+{
+  /* A "signal" is used to launch and monitor the kernel.  */
+  hsa_signal_t signal;
+  XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
+	"Create signal");
+
+  /* Configure for a single-worker kernel.
+     The queue size is a power of two, so masking the write index gives
+     the ring-buffer slot for the next packet.  */
+  uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
+  const uint32_t queueMask = queue->size - 1;
+  hsa_kernel_dispatch_packet_t *dispatch_packet =
+    &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
+							       queueMask]);
+  dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+  /* One workgroup of 1x64x1 work-items.  */
+  dispatch_packet->workgroup_size_x = (uint16_t) 1;
+  dispatch_packet->workgroup_size_y = (uint16_t) 64;
+  dispatch_packet->workgroup_size_z = (uint16_t) 1;
+  dispatch_packet->grid_size_x = 1;
+  dispatch_packet->grid_size_y = 64;
+  dispatch_packet->grid_size_z = 1;
+  dispatch_packet->completion_signal = signal;
+  dispatch_packet->kernel_object = kernel;
+  dispatch_packet->kernarg_address = (void *) kernargs;
+  dispatch_packet->private_segment_size = private_segment_size;
+  dispatch_packet->group_segment_size = group_segment_size;
+
+  uint16_t header = 0;
+  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+  header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+
+  /* The header+setup word must be stored last, atomically, with release
+     semantics: writing it publishes the packet to the packet processor.  */
+  __atomic_store_n ((uint32_t *) dispatch_packet,
+		    header | (dispatch_packet->setup << 16),
+		    __ATOMIC_RELEASE);
+
+  if (debug)
+    fprintf (stderr, "Launch kernel\n");
+
+  /* Bump the write index and ring the doorbell to start execution.  */
+  hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
+  hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
+  /* Kernel running ......
+     Poll the completion signal (decremented to 0 by the device) while
+     draining any console output the kernel produces.  */
+  int print_index = 0;
+  while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
+					     1, 1000000,
+					     HSA_WAIT_STATE_ACTIVE) != 0)
+    {
+      usleep (10000);
+      gomp_print_output (kernargs, &print_index);
+    }
+
+  /* Flush any output emitted just before completion.  */
+  gomp_print_output (kernargs, &print_index);
+
+  if (debug)
+    fprintf (stderr, "Kernel exited\n");
+
+  XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
+	"Clean up signal");
+}
+
+int
+main (int argc, char *argv[])
+{
+  /* Parse our own options; the first non-option argument is the kernel
+     file, and everything after it belongs to the kernel.  */
+  int kernel_arg = 0;
+  for (int i = 1; i < argc; i++)
+    {
+      if (!strcmp (argv[i], "--help"))
+	{
+	  usage (argv[0]);
+	  return 0;
+	}
+      else if (!strcmp (argv[i], "--version"))
+	{
+	  version (argv[0]);
+	  return 0;
+	}
+      else if (!strcmp (argv[i], "--debug"))
+	debug = true;
+      else if (argv[i][0] == '-')
+	{
+	  usage (argv[0]);
+	  return 1;
+	}
+      else
+	{
+	  kernel_arg = i;
+	  break;
+	}
+    }
+
+  if (!kernel_arg)
+    {
+      /* No kernel arguments were found.  */
+      usage (argv[0]);
+      return 1;
+    }
+
+  /* The remaining arguments are for the GCN kernel.  */
+  int kernel_argc = argc - kernel_arg;
+  char **kernel_argv = &argv[kernel_arg];
+
+  init_device ();
+  load_image (kernel_argv[0]);
+
+  /* Calculate size of function parameters + argv data.  */
+  size_t args_size = 0;
+  for (int i = 0; i < kernel_argc; i++)
+    args_size += strlen (kernel_argv[i]) + 1;
+
+  /* Allocate device memory for both function parameters and the argv
+     data.  */
+  size_t heap_size = 10 * 1024 * 1024;	/* 10MB.  */
+  struct kernargs *kernargs = device_malloc (sizeof (*kernargs) + heap_size);
+  struct argdata
+  {
+    int64_t argv_data[kernel_argc];
+    char strings[args_size];
+  } *args = device_malloc (sizeof (struct argdata));
+
+  /* Write the data to the target.  */
+  kernargs->argc = kernel_argc;
+  kernargs->argv = (int64_t) args->argv_data;
+  kernargs->out_ptr = (int64_t) &kernargs->output_data;
+  kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
+  kernargs->output_data.next_output = 0;
+  for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
+			    / sizeof (kernargs->output_data.queue[0])); i++)
+    kernargs->output_data.queue[i].written = 0;
+  int offset = 0;
+  for (int i = 0; i < kernel_argc; i++)
+    {
+      size_t arg_len = strlen (kernel_argv[i]) + 1;
+      args->argv_data[i] = (int64_t) &args->strings[offset];
+      /* arg_len already includes the NUL terminator; copying arg_len+1
+	 bytes (as before) over-read the source and overflowed the
+	 strings buffer by one byte on the last argument.  */
+      memcpy (&args->strings[offset], kernel_argv[i], arg_len);
+      offset += arg_len;
+    }
+  kernargs->heap_ptr = (int64_t) &kernargs->heap;
+  kernargs->heap.size = heap_size;
+
+  /* Run the kernel on the GPU.  */
+  run (kernargs);
+  unsigned int return_value =
+    (unsigned int) kernargs->output_data.return_value;
+
+  /* Decode the exit status: the top half distinguishes "never set"
+     (0xcafe), "set by exit/signal" (0xffff), and "return from main" (0).  */
+  unsigned int upper = (return_value & ~0xffff) >> 16;
+  if (upper == 0xcafe)
+    printf ("Kernel exit value was never set\n");
+  else if (upper == 0xffff)
+    ; /* Set by exit.  */
+  else if (upper == 0)
+    ; /* Set by return from main.  */
+  else
+    printf ("Possible kernel exit value corruption, 2 most significant bytes "
+	    "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
+
+  if (upper == 0xffff)
+    {
+      unsigned int signal = (return_value >> 8) & 0xff;
+      if (signal == SIGABRT)
+	printf ("Kernel aborted\n");
+      else if (signal != 0)
+	printf ("Kernel received unknown signal\n");
+    }
+
+  if (debug)
+    printf ("Kernel exit value: %d\n", return_value & 0xff);
+
+  /* Clean shut down.  */
+  XHSA (hsa_fns.hsa_memory_free_fn (args),
+	"Clean up device argv memory");
+  XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
+	"Clean up device memory");
+  XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
+	"Clean up GCN executable");
+  XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
+	"Clean up device queue");
+  XHSA (hsa_fns.hsa_shut_down_fn (),
+	"Shut down run-time");
+
+  return return_value & 0xff;
+}
new file mode 100644
@@ -0,0 +1,715 @@
+/* Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3, or (at your option) any later
+ version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+/* {{{ Includes. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "gimple-iterator.h"
+#include "cfghooks.h"
+#include "cfgloop.h"
+#include "tm_p.h"
+#include "stringpool.h"
+#include "fold-const.h"
+#include "varasm.h"
+#include "omp-low.h"
+#include "omp-general.h"
+#include "internal-fn.h"
+#include "tree-vrp.h"
+#include "tree-ssanames.h"
+#include "tree-ssa-operands.h"
+#include "gimplify.h"
+#include "tree-phinodes.h"
+#include "cgraph.h"
+#include "targhooks.h"
+#include "langhooks-def.h"
+
+/* }}} */
+/* {{{ OMP GCN pass. */
+
+/* Scan every call statement in the current function and replace calls
+   to the OpenMP thread-/team-number builtins with the GCN dimension
+   builtin; returns the TODO flags for the pass manager.  */
+unsigned int
+execute_omp_gcn (void)
+{
+  /* Calls are matched by DECL_NAME rather than decl pointer identity.  */
+  tree thr_num_tree = builtin_decl_explicit (BUILT_IN_OMP_GET_THREAD_NUM);
+  tree thr_num_id = DECL_NAME (thr_num_tree);
+  tree team_num_tree = builtin_decl_explicit (BUILT_IN_OMP_GET_TEAM_NUM);
+  tree team_num_id = DECL_NAME (team_num_tree);
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+  unsigned int todo = 0;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	gimple *call = gsi_stmt (gsi);
+	tree decl;
+
+	if (is_gimple_call (call) && (decl = gimple_call_fndecl (call)))
+	  {
+	    tree decl_id = DECL_NAME (decl);
+	    tree lhs = gimple_get_lhs (call);
+
+	    if (decl_id == thr_num_id)
+	      {
+		if (dump_file && (dump_flags & TDF_DETAILS))
+		  fprintf (dump_file,
+			   "Replace '%s' with __builtin_gcn_dim_pos.\n",
+			   IDENTIFIER_POINTER (decl_id));
+
+		/* Transform this:
+		   lhs = __builtin_omp_get_thread_num ()
+		   to this:
+		   lhs = __builtin_gcn_dim_pos (1)  */
+		tree fn = targetm.builtin_decl (GCN_BUILTIN_OMP_DIM_POS, 0);
+		tree fnarg = build_int_cst (unsigned_type_node, 1);
+		gimple *stmt = gimple_build_call (fn, 1, fnarg);
+		gimple_call_set_lhs (stmt, lhs);
+		gsi_replace (&gsi, stmt, true);
+
+		todo |= TODO_update_ssa;
+	      }
+	    else if (decl_id == team_num_id)
+	      {
+		if (dump_file && (dump_flags & TDF_DETAILS))
+		  fprintf (dump_file,
+			   "Replace '%s' with __builtin_gcn_dim_pos.\n",
+			   IDENTIFIER_POINTER (decl_id));
+
+		/* Transform this:
+		   lhs = __builtin_omp_get_team_num ()
+		   to this:
+		   lhs = __builtin_gcn_dim_pos (0)  */
+		tree fn = targetm.builtin_decl (GCN_BUILTIN_OMP_DIM_POS, 0);
+		tree fnarg = build_zero_cst (unsigned_type_node);
+		gimple *stmt = gimple_build_call (fn, 1, fnarg);
+		gimple_call_set_lhs (stmt, lhs);
+		gsi_replace (&gsi, stmt, true);
+
+		todo |= TODO_update_ssa;
+	      }
+	  }
+      }
+
+  return todo;
+}
+
+namespace
+{
+
+  /* Pass registration data for the omp_gcn gimple pass.  */
+  const pass_data pass_data_omp_gcn = {
+    GIMPLE_PASS,
+    "omp_gcn", /* name */
+    OPTGROUP_NONE, /* optinfo_flags */
+    TV_NONE, /* tv_id */
+    0, /* properties_required */
+    0, /* properties_provided */
+    0, /* properties_destroyed */
+    0, /* todo_flags_start */
+    TODO_df_finish, /* todo_flags_finish */
+  };
+
+  /* Thin opt_pass wrapper: runs execute_omp_gcn when -fopenmp is on.  */
+  class pass_omp_gcn : public gimple_opt_pass
+  {
+  public:
+    pass_omp_gcn (gcc::context *ctxt)
+      : gimple_opt_pass (pass_data_omp_gcn, ctxt)
+    {
+    }
+
+    /* opt_pass methods: */
+    virtual bool gate (function *)
+    {
+      return flag_openmp;
+    }
+
+    virtual unsigned int execute (function *)
+    {
+      return execute_omp_gcn ();
+    }
+
+  }; /* class pass_omp_gcn.  */
+
+} /* anon namespace.  */
+
+/* Factory entry point used by the pass manager (declared in the target
+   protos header); the caller owns the returned pass.  */
+gimple_opt_pass *
+make_pass_omp_gcn (gcc::context *ctxt)
+{
+  return new pass_omp_gcn (ctxt);
+}
+
+/* }}} */
+/* {{{ OpenACC reductions. */
+
+/* Global lock variable, needed for 128bit worker & gang reductions. */
+
+static GTY(()) tree global_lock_var;
+
+/* Lazily generate the global_lock_var decl and return its address. */
+
+static tree
+gcn_global_lock_addr ()
+{
+  tree v = global_lock_var;
+
+  if (!v)
+    {
+      /* Create an external, public, volatile unsigned int named
+	 __reduction_lock; the libgomp-side object provides storage.  */
+      tree name = get_identifier ("__reduction_lock");
+      tree type = build_qualified_type (unsigned_type_node,
+					TYPE_QUAL_VOLATILE);
+      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
+      global_lock_var = v;
+      DECL_ARTIFICIAL (v) = 1;
+      DECL_EXTERNAL (v) = 1;
+      TREE_STATIC (v) = 1;
+      TREE_PUBLIC (v) = 1;
+      TREE_USED (v) = 1;
+      /* Its address is taken below, so it must be addressable and kept.  */
+      mark_addressable (v);
+      mark_decl_referenced (v);
+    }
+
+  return build_fold_addr_expr (v);
+}
+
+/* Helper function for gcn_reduction_update.
+
+ Insert code to locklessly update *PTR with *PTR OP VAR just before
+ GSI. We use a lockless scheme for nearly all case, which looks
+ like:
+ actual = initval (OP);
+ do {
+ guess = actual;
+ write = guess OP myval;
+ actual = cmp&swap (ptr, guess, write)
+ } while (actual bit-different-to guess);
+ return write;
+
+ This relies on a cmp&swap instruction, which is available for 32- and
+ 64-bit types. Larger types must use a locking scheme. */
+
+static tree
+gcn_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
+		     tree ptr, tree var, tree_code op)
+{
+  /* Default to the 32-bit compare-and-swap builtin; upgrade to the
+     64-bit variant below when the reduction type is that wide.  */
+  unsigned fn = GCN_BUILTIN_CMP_SWAP;
+  tree_code code = NOP_EXPR;
+  tree arg_type = unsigned_type_node;
+  tree var_type = TREE_TYPE (var);
+
+  /* Floating/complex values must be reinterpreted bit-for-bit as
+     integers for the cmp&swap, not converted.  */
+  if (TREE_CODE (var_type) == COMPLEX_TYPE
+      || TREE_CODE (var_type) == REAL_TYPE)
+    code = VIEW_CONVERT_EXPR;
+
+  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
+    {
+      arg_type = long_long_unsigned_type_node;
+      fn = GCN_BUILTIN_CMP_SWAPLL;
+    }
+
+  tree swap_fn = gcn_builtin_decl (fn, true);
+
+  /* Emit "init_var = initval (OP)" (the reduction identity) before GSI.  */
+  gimple_seq init_seq = NULL;
+  tree init_var = make_ssa_name (arg_type);
+  tree init_expr = omp_reduction_init_op (loc, op, var_type);
+  init_expr = fold_build1 (code, arg_type, init_expr);
+  gimplify_assign (init_var, init_expr, &init_seq);
+  gimple *init_end = gimple_seq_last (init_seq);
+
+  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
+
+  /* Split the block just after the init stmts.  */
+  basic_block pre_bb = gsi_bb (*gsi);
+  edge pre_edge = split_block (pre_bb, init_end);
+  basic_block loop_bb = pre_edge->dest;
+  pre_bb = pre_edge->src;
+  /* Reset the iterator.  */
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  tree expect_var = make_ssa_name (arg_type);
+  tree actual_var = make_ssa_name (arg_type);
+  tree write_var = make_ssa_name (arg_type);
+
+  /* Build and insert the reduction calculation:
+     write = expect OP var (computed in var_type, carried in arg_type).  */
+  gimple_seq red_seq = NULL;
+  tree write_expr = fold_build1 (code, var_type, expect_var);
+  write_expr = fold_build2 (op, var_type, write_expr, var);
+  write_expr = fold_build1 (code, arg_type, write_expr);
+  gimplify_assign (write_var, write_expr, &red_seq);
+
+  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
+
+  /* Build & insert the cmp&swap sequence and the loop-exit test
+     "actual == expect".  */
+  gimple_seq latch_seq = NULL;
+  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
+					ptr, expect_var, write_var);
+  gimplify_assign (actual_var, swap_expr, &latch_seq);
+
+  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
+				   NULL_TREE, NULL_TREE);
+  gimple_seq_add_stmt (&latch_seq, cond);
+
+  gimple *latch_end = gimple_seq_last (latch_seq);
+  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
+
+  /* Split the block just after the latch stmts.  */
+  edge post_edge = split_block (loop_bb, latch_end);
+  basic_block post_bb = post_edge->dest;
+  loop_bb = post_edge->src;
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  /* Success exits the loop; failure retries via the back edge.  */
+  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
+  /* post_edge->probability = profile_probability::even (); */
+  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
+  /* loop_edge->probability = profile_probability::even (); */
+  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
+  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
+
+  /* expect = PHI <init (preheader), actual (latch)>.  */
+  gphi *phi = create_phi_node (expect_var, loop_bb);
+  add_phi_arg (phi, init_var, pre_edge, loc);
+  add_phi_arg (phi, actual_var, loop_edge, loc);
+
+  /* Register the new single-block loop with the loop tree.  */
+  loop *loop = alloc_loop ();
+  loop->header = loop_bb;
+  loop->latch = loop_bb;
+  add_loop (loop, loop_bb->loop_father);
+
+  /* The result is the value we wrote, back in the reduction's type.  */
+  return fold_build1 (code, var_type, write_var);
+}
+
+/* Helper function for gcn_reduction_update.
+
+ Insert code to lockfully update *PTR with *PTR OP VAR just before
+ GSI. This is necessary for types larger than 64 bits, where there
+ is no cmp&swap instruction to implement a lockless scheme. We use
+ a lock variable in global memory.
+
+ while (cmp&swap (&lock_var, 0, 1))
+ continue;
+ T accum = *ptr;
+ accum = accum OP var;
+ *ptr = accum;
+ cmp&swap (&lock_var, 1, 0);
+ return accum;
+
+ A lock in global memory is necessary to force execution engine
+ descheduling and avoid resource starvation that can occur if the
+ lock is in shared memory. */
+
+static tree
+gcn_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
+ tree ptr, tree var, tree_code op)
+{
+ tree var_type = TREE_TYPE (var);
+ tree swap_fn = gcn_builtin_decl (GCN_BUILTIN_CMP_SWAP, true);
+ tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
+ tree uns_locked = build_int_cst (unsigned_type_node, 1);
+
+ /* Split the block just before the gsi. Insert a gimple nop to make
+ this easier. */
+ gimple *nop = gimple_build_nop ();
+ gsi_insert_before (gsi, nop, GSI_SAME_STMT);
+ basic_block entry_bb = gsi_bb (*gsi);
+ edge entry_edge = split_block (entry_bb, nop);
+ basic_block lock_bb = entry_edge->dest;
+ /* Reset the iterator. */
+ *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+ /* Build and insert the locking sequence. */
+ gimple_seq lock_seq = NULL;
+ tree lock_var = make_ssa_name (unsigned_type_node);
+ tree lock_expr = gcn_global_lock_addr ();
+ lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
+ uns_unlocked, uns_locked);
+ gimplify_assign (lock_var, lock_expr, &lock_seq);
+ gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
+ NULL_TREE, NULL_TREE);
+ gimple_seq_add_stmt (&lock_seq, cond);
+ gimple *lock_end = gimple_seq_last (lock_seq);
+ gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
+
+ /* Split the block just after the lock sequence. */
+ edge locked_edge = split_block (lock_bb, lock_end);
+ basic_block update_bb = locked_edge->dest;
+ lock_bb = locked_edge->src;
+ *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+ /* Create the lock loop. */
+ locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
+ locked_edge->probability = profile_probability::even ();
+ edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
+ loop_edge->probability = profile_probability::even ();
+ set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
+ set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
+
+ /* Create the loop structure. */
+ loop *lock_loop = alloc_loop ();
+ lock_loop->header = lock_bb;
+ lock_loop->latch = lock_bb;
+ lock_loop->nb_iterations_estimate = 1;
+ lock_loop->any_estimate = true;
+ add_loop (lock_loop, entry_bb->loop_father);
+
+ /* Build and insert the reduction calculation. */
+ gimple_seq red_seq = NULL;
+ tree acc_in = make_ssa_name (var_type);
+ tree ref_in = build_simple_mem_ref (ptr);
+ TREE_THIS_VOLATILE (ref_in) = 1;
+ gimplify_assign (acc_in, ref_in, &red_seq);
+
+ tree acc_out = make_ssa_name (var_type);
+ tree update_expr = fold_build2 (op, var_type, ref_in, var);
+ gimplify_assign (acc_out, update_expr, &red_seq);
+
+ tree ref_out = build_simple_mem_ref (ptr);
+ TREE_THIS_VOLATILE (ref_out) = 1;
+ gimplify_assign (ref_out, acc_out, &red_seq);
+
+ gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
+
+ /* Build & insert the unlock sequence. */
+ gimple_seq unlock_seq = NULL;
+ tree unlock_expr = gcn_global_lock_addr ();
+ unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
+ uns_locked, uns_unlocked);
+ gimplify_and_add (unlock_expr, &unlock_seq);
+ gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
+
+ return acc_out;
+}
+
+/* Emit a sequence to update a reduction accumulator at *PTR with the
+ value held in VAR using operator OP. Return the updated value.
+
+ TODO: optimize for atomic ops and independent complex ops. */
+
+static tree
+gcn_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
+ tree ptr, tree var, tree_code op)
+{
+ tree type = TREE_TYPE (var);
+ tree size = TYPE_SIZE (type);
+
+ if (size == TYPE_SIZE (unsigned_type_node)
+ || size == TYPE_SIZE (long_long_unsigned_type_node))
+ return gcn_lockless_update (loc, gsi, ptr, var, op);
+ else
+ return gcn_lockfull_update (loc, gsi, ptr, var, op);
+}
+
+/* Return a temporary variable decl to use for an OpenACC worker reduction. */
+
+static tree
+gcn_goacc_get_worker_red_decl (tree type, unsigned offset)
+{
+ machine_function *machfun = cfun->machine;
+ tree existing_decl;
+
+ if (TREE_CODE (type) == REFERENCE_TYPE)
+ type = TREE_TYPE (type);
+
+ tree var_type
+ = build_qualified_type (type,
+ (TYPE_QUALS (type)
+ | ENCODE_QUAL_ADDR_SPACE (ADDR_SPACE_LDS)));
+
+ if (machfun->reduc_decls
+ && offset < machfun->reduc_decls->length ()
+ && (existing_decl = (*machfun->reduc_decls)[offset]))
+ {
+ gcc_assert (TREE_TYPE (existing_decl) == var_type);
+ return existing_decl;
+ }
+ else
+ {
+ char name[50];
+ sprintf (name, ".oacc_reduction_%u", offset);
+ tree decl = create_tmp_var_raw (var_type, name);
+
+ DECL_CONTEXT (decl) = NULL_TREE;
+ TREE_STATIC (decl) = 1;
+
+ varpool_node::finalize_decl (decl);
+
+ vec_safe_grow_cleared (machfun->reduc_decls, offset + 1);
+ (*machfun->reduc_decls)[offset] = decl;
+
+ return decl;
+ }
+
+ return NULL_TREE;
+}
+
+/* Expand IFN_GOACC_REDUCTION_SETUP. */
+
+static void
+gcn_goacc_reduction_setup (gcall *call)
+{
+ gimple_stmt_iterator gsi = gsi_for_stmt (call);
+ tree lhs = gimple_call_lhs (call);
+ tree var = gimple_call_arg (call, 2);
+ int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+ gimple_seq seq = NULL;
+
+ push_gimplify_context (true);
+
+ if (level != GOMP_DIM_GANG)
+ {
+ /* Copy the receiver object. */
+ tree ref_to_res = gimple_call_arg (call, 1);
+
+ if (!integer_zerop (ref_to_res))
+ var = build_simple_mem_ref (ref_to_res);
+ }
+
+ if (level == GOMP_DIM_WORKER)
+ {
+ tree var_type = TREE_TYPE (var);
+ /* Store incoming value to worker reduction buffer. */
+ tree offset = gimple_call_arg (call, 5);
+ tree decl
+ = gcn_goacc_get_worker_red_decl (var_type, TREE_INT_CST_LOW (offset));
+
+ gimplify_assign (decl, var, &seq);
+ }
+
+ if (lhs)
+ gimplify_assign (lhs, var, &seq);
+
+ pop_gimplify_context (NULL);
+ gsi_replace_with_seq (&gsi, seq, true);
+}
+
+/* Expand IFN_GOACC_REDUCTION_INIT. */
+
+static void
+gcn_goacc_reduction_init (gcall *call)
+{
+ gimple_stmt_iterator gsi = gsi_for_stmt (call);
+ tree lhs = gimple_call_lhs (call);
+ tree var = gimple_call_arg (call, 2);
+ int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+ enum tree_code rcode
+ = (enum tree_code) TREE_INT_CST_LOW (gimple_call_arg (call, 4));
+ tree init = omp_reduction_init_op (gimple_location (call), rcode,
+ TREE_TYPE (var));
+ gimple_seq seq = NULL;
+
+ push_gimplify_context (true);
+
+ if (level == GOMP_DIM_GANG)
+ {
+ /* If there's no receiver object, propagate the incoming VAR. */
+ tree ref_to_res = gimple_call_arg (call, 1);
+ if (integer_zerop (ref_to_res))
+ init = var;
+ }
+
+ if (lhs)
+ gimplify_assign (lhs, init, &seq);
+
+ pop_gimplify_context (NULL);
+ gsi_replace_with_seq (&gsi, seq, true);
+}
+
+/* Expand IFN_GOACC_REDUCTION_FINI. */
+
+static void
+gcn_goacc_reduction_fini (gcall *call)
+{
+ gimple_stmt_iterator gsi = gsi_for_stmt (call);
+ tree lhs = gimple_call_lhs (call);
+ tree ref_to_res = gimple_call_arg (call, 1);
+ tree var = gimple_call_arg (call, 2);
+ int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+ enum tree_code op
+ = (enum tree_code) TREE_INT_CST_LOW (gimple_call_arg (call, 4));
+ gimple_seq seq = NULL;
+ tree r = NULL_TREE;
+
+ push_gimplify_context (true);
+
+ tree accum = NULL_TREE;
+
+ if (level == GOMP_DIM_WORKER)
+ {
+ tree var_type = TREE_TYPE (var);
+ tree offset = gimple_call_arg (call, 5);
+ tree decl
+ = gcn_goacc_get_worker_red_decl (var_type, TREE_INT_CST_LOW (offset));
+
+ accum = build_fold_addr_expr (decl);
+ }
+ else if (integer_zerop (ref_to_res))
+ r = var;
+ else
+ accum = ref_to_res;
+
+ if (accum)
+ {
+ /* UPDATE the accumulator. */
+ gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+ seq = NULL;
+ r = gcn_reduction_update (gimple_location (call), &gsi, accum, var, op);
+ }
+
+ if (lhs)
+ gimplify_assign (lhs, r, &seq);
+ pop_gimplify_context (NULL);
+
+ gsi_replace_with_seq (&gsi, seq, true);
+}
+
+/* Expand IFN_GOACC_REDUCTION_TEARDOWN. */
+
+static void
+gcn_goacc_reduction_teardown (gcall *call)
+{
+ gimple_stmt_iterator gsi = gsi_for_stmt (call);
+ tree lhs = gimple_call_lhs (call);
+ tree var = gimple_call_arg (call, 2);
+ int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+ gimple_seq seq = NULL;
+
+ push_gimplify_context (true);
+
+ if (level == GOMP_DIM_WORKER)
+ {
+ tree var_type = TREE_TYPE (var);
+
+ /* Read the worker reduction buffer. */
+ tree offset = gimple_call_arg (call, 5);
+ tree decl
+ = gcn_goacc_get_worker_red_decl (var_type, TREE_INT_CST_LOW (offset));
+ var = decl;
+ }
+
+ if (level != GOMP_DIM_GANG)
+ {
+ /* Write to the receiver object. */
+ tree ref_to_res = gimple_call_arg (call, 1);
+
+ if (!integer_zerop (ref_to_res))
+ gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
+ }
+
+ if (lhs)
+ gimplify_assign (lhs, var, &seq);
+
+ pop_gimplify_context (NULL);
+
+ gsi_replace_with_seq (&gsi, seq, true);
+}
+
+/* Implement TARGET_GOACC_REDUCTION.
+
+ Expand calls to the GOACC REDUCTION internal function, into a sequence of
+ gimple instructions. */
+
+void
+gcn_goacc_reduction (gcall *call)
+{
+ int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+
+ if (level == GOMP_DIM_VECTOR)
+ {
+ default_goacc_reduction (call);
+ return;
+ }
+
+ unsigned code = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+
+ switch (code)
+ {
+ case IFN_GOACC_REDUCTION_SETUP:
+ gcn_goacc_reduction_setup (call);
+ break;
+
+ case IFN_GOACC_REDUCTION_INIT:
+ gcn_goacc_reduction_init (call);
+ break;
+
+ case IFN_GOACC_REDUCTION_FINI:
+ gcn_goacc_reduction_fini (call);
+ break;
+
+ case IFN_GOACC_REDUCTION_TEARDOWN:
+ gcn_goacc_reduction_teardown (call);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* Implement TARGET_GOACC_ADJUST_PROPAGATION_RECORD.
+
+ Tweak (worker) propagation record, e.g. to put it in shared memory. */
+
+tree
+gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
+ const char *name)
+{
+ tree type = record_type;
+
+ TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS;
+
+ if (!sender)
+ type = build_pointer_type (type);
+
+ tree decl = create_tmp_var_raw (type, name);
+
+ if (sender)
+ {
+ DECL_CONTEXT (decl) = NULL_TREE;
+ TREE_STATIC (decl) = 1;
+ }
+
+ if (sender)
+ varpool_node::finalize_decl (decl);
+
+ return decl;
+}
+
+void
+gcn_goacc_adjust_gangprivate_decl (tree var)
+{
+ tree type = TREE_TYPE (var);
+ tree lds_type = build_qualified_type (type,
+ TYPE_QUALS_NO_ADDR_SPACE (type)
+ | ENCODE_QUAL_ADDR_SPACE (ADDR_SPACE_LDS));
+ machine_function *machfun = cfun->machine;
+
+ TREE_TYPE (var) = lds_type;
+ TREE_STATIC (var) = 1;
+
+ /* We're making VAR static. We have to mangle the name to avoid collisions
+ between different local variables that share the same names. */
+ lhd_set_decl_assembler_name (var);
+
+ varpool_node::finalize_decl (var);
+
+ if (machfun)
+ machfun->use_flat_addressing = true;
+}
+
+/* }}} */
new file mode 100644
@@ -0,0 +1,3509 @@
+;; Copyright (C) 2016-2018 Free Software Foundation, Inc.
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; {{{ Vector iterators
+
+; Vector modes for one vector register
+(define_mode_iterator VEC_1REG_MODE
+ [V64QI V64HI V64SI V64HF V64SF])
+(define_mode_iterator VEC_1REG_ALT
+ [V64QI V64HI V64SI V64HF V64SF])
+
+(define_mode_iterator VEC_1REG_INT_MODE
+ [V64QI V64HI V64SI])
+(define_mode_iterator VEC_1REG_INT_ALT
+ [V64QI V64HI V64SI])
+
+(define_mode_iterator SCALAR_1REG_INT_MODE
+ [QI HI SI])
+
+; Vector modes for two vector registers
+(define_mode_iterator VEC_2REG_MODE
+ [V64DI V64DF])
+
+; All of above
+(define_mode_iterator VEC_REG_MODE
+ [V64QI V64HI V64SI V64HF V64SF ; Single reg
+ V64DI V64DF]) ; Double reg
+
+(define_mode_attr scalar_mode
+ [(V64QI "qi") (V64HI "hi") (V64SI "si")
+ (V64HF "hf") (V64SF "sf") (V64DI "di") (V64DF "df")])
+
+(define_mode_attr SCALAR_MODE
+ [(V64QI "QI") (V64HI "HI") (V64SI "SI")
+ (V64HF "HF") (V64SF "SF") (V64DI "DI") (V64DF "DF")])
+
+;; }}}
+;; {{{ Vector moves
+
+; This is the entry point for all vector register moves. Memory accesses can
+; come this way also, but will more usually use the reload_in/out,
+; gather/scatter, maskload/store, etc.
+
+(define_expand "mov<mode>"
+ [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+ (match_operand:VEC_REG_MODE 1 "general_operand"))]
+ ""
+ {
+ /* Do not attempt to move unspec vectors. */
+ if (GET_CODE (operands[1]) == UNSPEC
+ && XINT (operands[1], 1) == UNSPEC_VECTOR)
+ FAIL;
+
+ if (can_create_pseudo_p ())
+ {
+ rtx exec = gcn_full_exec_reg ();
+ rtx undef = gcn_gen_undef (<MODE>mode);
+
+ if (MEM_P (operands[0]))
+ {
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+ rtx scratch = gen_rtx_SCRATCH (V64DImode);
+ rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+ rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+ rtx expr = gcn_expand_scalar_to_vector_address (<MODE>mode, exec,
+ operands[0],
+ scratch);
+ emit_insn (gen_scatter<mode>_expr (expr, operands[1], a, v, exec));
+ }
+ else if (MEM_P (operands[1]))
+ {
+ rtx scratch = gen_rtx_SCRATCH (V64DImode);
+ rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+ rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+ rtx expr = gcn_expand_scalar_to_vector_address (<MODE>mode, exec,
+ operands[1],
+ scratch);
+ emit_insn (gen_gather<mode>_expr (operands[0], expr, a, v, undef,
+ exec));
+ }
+ else
+ emit_insn (gen_mov<mode>_vector (operands[0], operands[1], exec,
+ undef));
+
+ DONE;
+ }
+ })
+
+; A vector move that does not reference EXEC explicitly, and therefore is
+; suitable for use during or after LRA. It uses the "exec" attribute instead.
+
+(define_insn "mov<mode>_full"
+ [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v")
+ (match_operand:VEC_1REG_MODE 1 "general_operand" "vA,B"))]
+ "lra_in_progress || reload_completed"
+ "v_mov_b32\t%0, %1"
+ [(set_attr "type" "vop1,vop1")
+ (set_attr "length" "4,8")
+ (set_attr "exec" "full")])
+
+(define_insn "mov<mode>_full"
+ [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "=v")
+ (match_operand:VEC_2REG_MODE 1 "general_operand" "vDB"))]
+ "lra_in_progress || reload_completed"
+ {
+ if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1]))
+ return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1";
+ else
+ return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1";
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")
+ (set_attr "exec" "full")])
+
+; A SGPR-base load looks like:
+; <load> v, Sg
+;
+; There's no hardware instruction that corresponds to this, but vector base
+; addresses are placed in an SGPR because it is easier to add to a vector.
+; We also have a temporary vT, and the vector v1 holding numbered lanes.
+;
+; Rewrite as:
+; vT = v1 << log2(element-size)
+; vT += Sg
+; flat_load v, vT
+
+(define_insn "mov<mode>_sgprbase"
+ [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "= v, v, v, m")
+ (unspec:VEC_1REG_MODE
+ [(match_operand:VEC_1REG_MODE 1 "general_operand" " vA,vB, m, v")]
+ UNSPEC_SGPRBASE))
+ (clobber (match_operand:V64DI 2 "register_operand" "=&v,&v,&v,&v"))]
+ "lra_in_progress || reload_completed"
+ "@
+ v_mov_b32\t%0, %1
+ v_mov_b32\t%0, %1
+ #
+ #"
+ [(set_attr "type" "vop1,vop1,*,*")
+ (set_attr "length" "4,8,12,12")
+ (set_attr "exec" "full")])
+
+(define_insn "mov<mode>_sgprbase"
+ [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "= v, v, m")
+ (unspec:VEC_2REG_MODE
+ [(match_operand:VEC_2REG_MODE 1 "general_operand" "vDB, m, v")]
+ UNSPEC_SGPRBASE))
+ (clobber (match_operand:V64DI 2 "register_operand" "=&v,&v,&v"))]
+ "lra_in_progress || reload_completed"
+ "@
+ * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
+ return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \
+ else \
+ return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
+ #
+ #"
+ [(set_attr "type" "vmult,*,*")
+ (set_attr "length" "8,12,12")
+ (set_attr "exec" "full")])
+
+; reload_in was once a standard name, but here it's only referenced by
+; gcn_secondary_reload. It allows a reload with a scratch register.
+
+(define_expand "reload_in<mode>"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand" "= v")
+ (match_operand:VEC_REG_MODE 1 "memory_operand" " m"))
+ (clobber (match_operand:V64DI 2 "register_operand" "=&v"))]
+ ""
+ {
+ emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], operands[2]));
+ DONE;
+ })
+
+; reload_out is similar to reload_in, above.
+
+(define_expand "reload_out<mode>"
+ [(set (match_operand:VEC_REG_MODE 0 "memory_operand" "= m")
+ (match_operand:VEC_REG_MODE 1 "register_operand" " v"))
+ (clobber (match_operand:V64DI 2 "register_operand" "=&v"))]
+ ""
+ {
+ emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], operands[2]));
+ DONE;
+ })
+
+; This is the 'normal' kind of vector move created before register allocation.
+
+(define_insn "mov<mode>_vector"
+ [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand"
+ "=v, v, v, v, v, m")
+ (vec_merge:VEC_1REG_MODE
+ (match_operand:VEC_1REG_MODE 1 "general_operand"
+ "vA, B, v,vA, m, v")
+ (match_operand:VEC_1REG_MODE 3 "gcn_alu_or_unspec_operand"
+ "U0,U0,vA,vA,U0,U0")
+ (match_operand:DI 2 "register_operand" " e, e,cV,Sg, e, e")))
+ (clobber (match_scratch:V64DI 4 "=X, X, X, X,&v,&v"))]
+ "!MEM_P (operands[0]) || REG_P (operands[1])"
+ "@
+ v_mov_b32\t%0, %1
+ v_mov_b32\t%0, %1
+ v_cndmask_b32\t%0, %3, %1, vcc
+ v_cndmask_b32\t%0, %3, %1, %2
+ #
+ #"
+ [(set_attr "type" "vop1,vop1,vop2,vop3a,*,*")
+ (set_attr "length" "4,8,4,8,16,16")
+ (set_attr "exec" "*,*,full,full,*,*")])
+
+; This variant does not accept an unspec, but does permit MEM
+; read/modify/write which is necessary for maskstore.
+
+(define_insn "*mov<mode>_vector_match"
+ [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v, v, m")
+ (vec_merge:VEC_1REG_MODE
+ (match_operand:VEC_1REG_MODE 1 "general_operand" "vA,B, m, v")
+ (match_dup 0)
+ (match_operand:DI 2 "gcn_exec_reg_operand" " e,e, e, e")))
+ (clobber (match_scratch:V64DI 3 "=X,X,&v,&v"))]
+ "!MEM_P (operands[0]) || REG_P (operands[1])"
+ "@
+ v_mov_b32\t%0, %1
+ v_mov_b32\t%0, %1
+ #
+ #"
+ [(set_attr "type" "vop1,vop1,*,*")
+ (set_attr "length" "4,8,16,16")])
+
+(define_insn "mov<mode>_vector"
+ [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand"
+ "= v, v, v, v, m")
+ (vec_merge:VEC_2REG_MODE
+ (match_operand:VEC_2REG_MODE 1 "general_operand"
+ "vDB, v0, v0, m, v")
+ (match_operand:VEC_2REG_MODE 3 "gcn_alu_or_unspec_operand"
+ " U0,vDA0,vDA0,U0,U0")
+ (match_operand:DI 2 "register_operand" " e, cV, Sg, e, e")))
+ (clobber (match_scratch:V64DI 4 "= X, X, X,&v,&v"))]
+ "!MEM_P (operands[0]) || REG_P (operands[1])"
+ {
+ if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1]))
+ switch (which_alternative)
+ {
+ case 0:
+ return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1";
+ case 1:
+ return "v_cndmask_b32\t%L0, %L3, %L1, vcc\;"
+ "v_cndmask_b32\t%H0, %H3, %H1, vcc";
+ case 2:
+ return "v_cndmask_b32\t%L0, %L3, %L1, %2\;"
+ "v_cndmask_b32\t%H0, %H3, %H1, %2";
+ }
+ else
+ switch (which_alternative)
+ {
+ case 0:
+ return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1";
+ case 1:
+ return "v_cndmask_b32\t%H0, %H3, %H1, vcc\;"
+ "v_cndmask_b32\t%L0, %L3, %L1, vcc";
+ case 2:
+ return "v_cndmask_b32\t%H0, %H3, %H1, %2\;"
+ "v_cndmask_b32\t%L0, %L3, %L1, %2";
+ }
+
+ return "#";
+ }
+ [(set_attr "type" "vmult,vmult,vmult,*,*")
+ (set_attr "length" "16,16,16,16,16")
+ (set_attr "exec" "*,full,full,*,*")])
+
+; This variant does not accept an unspec, but does permit MEM
+; read/modify/write which is necessary for maskstore.
+
+(define_insn "*mov<mode>_vector_match"
+ [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "=v, v, m")
+ (vec_merge:VEC_2REG_MODE
+ (match_operand:VEC_2REG_MODE 1 "general_operand" "vDB, m, v")
+ (match_dup 0)
+ (match_operand:DI 2 "gcn_exec_reg_operand" " e, e, e")))
+ (clobber (match_scratch:V64DI 3 "=X,&v,&v"))]
+ "!MEM_P (operands[0]) || REG_P (operands[1])"
+ "@
+ * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
+ return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \
+ else \
+ return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
+ #
+ #"
+ [(set_attr "type" "vmult,*,*")
+ (set_attr "length" "16,16,16")])
+
+; Expand scalar addresses into gather/scatter patterns
+
+(define_split
+ [(set (match_operand:VEC_REG_MODE 0 "memory_operand")
+ (unspec:VEC_REG_MODE
+ [(match_operand:VEC_REG_MODE 1 "general_operand")]
+ UNSPEC_SGPRBASE))
+ (clobber (match_scratch:V64DI 2))]
+ ""
+ [(set (mem:BLK (scratch))
+ (unspec:BLK [(match_dup 5) (match_dup 1)
+ (match_dup 6) (match_dup 7) (match_dup 8)]
+ UNSPEC_SCATTER))]
+ {
+ operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+ operands[0],
+ operands[2]);
+ operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+ operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+ operands[8] = gen_rtx_CONST_INT (VOIDmode, -1);
+ })
+
+(define_split
+ [(set (match_operand:VEC_REG_MODE 0 "memory_operand")
+ (vec_merge:VEC_REG_MODE
+ (match_operand:VEC_REG_MODE 1 "general_operand")
+ (match_operand:VEC_REG_MODE 3 "")
+ (match_operand:DI 2 "gcn_exec_reg_operand")))
+ (clobber (match_scratch:V64DI 4))]
+ ""
+ [(set (mem:BLK (scratch))
+ (unspec:BLK [(match_dup 5) (match_dup 1)
+ (match_dup 6) (match_dup 7) (match_dup 2)]
+ UNSPEC_SCATTER))]
+ {
+ operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode,
+ operands[2],
+ operands[0],
+ operands[4]);
+ operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+ operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+ })
+
+(define_split
+ [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+ (unspec:VEC_REG_MODE
+ [(match_operand:VEC_REG_MODE 1 "memory_operand")]
+ UNSPEC_SGPRBASE))
+ (clobber (match_scratch:V64DI 2))]
+ ""
+ [(set (match_dup 0)
+ (vec_merge:VEC_REG_MODE
+ (unspec:VEC_REG_MODE [(match_dup 5) (match_dup 6) (match_dup 7)
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER)
+ (match_dup 8)
+ (match_dup 9)))]
+ {
+ operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+ operands[1],
+ operands[2]);
+ operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+ operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+ operands[8] = gcn_gen_undef (<MODE>mode);
+ operands[9] = gen_rtx_CONST_INT (VOIDmode, -1);
+ })
+
+(define_split
+ [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+ (vec_merge:VEC_REG_MODE
+ (match_operand:VEC_REG_MODE 1 "memory_operand")
+ (match_operand:VEC_REG_MODE 3 "")
+ (match_operand:DI 2 "gcn_exec_reg_operand")))
+ (clobber (match_scratch:V64DI 4))]
+ ""
+ [(set (match_dup 0)
+ (vec_merge:VEC_REG_MODE
+ (unspec:VEC_REG_MODE [(match_dup 5) (match_dup 6) (match_dup 7)
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER)
+ (match_dup 3)
+ (match_dup 2)))]
+ {
+ operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode,
+ operands[2],
+ operands[1],
+ operands[4]);
+ operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+ operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+ })
+
+; TODO: Add zero/sign extending variants.
+
+;; }}}
+;; {{{ Lane moves
+
+; v_writelane and v_readlane work regardless of exec flags.
+; We allow source to be scratch.
+;
+; FIXME these should take A immediates
+
+(define_insn "*vec_set<mode>"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "= v")
+ (vec_merge:VEC_1REG_MODE
+ (vec_duplicate:VEC_1REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand" " SS"))
+ (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+ " U0")
+ (ashift (const_int 1)
+ (match_operand:SI 2 "gcn_alu_operand" "SSB"))))]
+ ""
+ "v_writelane_b32 %0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")
+ (set_attr "laneselect" "yes")])
+
+; FIXME: 64bit operations really should be splitters, but I am not sure how
+; to represent vertical subregs.
+(define_insn "*vec_set<mode>"
+ [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "= v")
+ (vec_merge:VEC_2REG_MODE
+ (vec_duplicate:VEC_2REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand" " SS"))
+ (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+ " U0")
+ (ashift (const_int 1)
+ (match_operand:SI 2 "gcn_alu_operand" "SSB"))))]
+ ""
+ "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2"
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")
+ (set_attr "laneselect" "yes")])
+
+(define_expand "vec_set<mode>"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand")
+ (vec_merge:VEC_REG_MODE
+ (vec_duplicate:VEC_REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand"))
+ (match_dup 0)
+ (ashift (const_int 1) (match_operand:SI 2 "gcn_alu_operand"))))]
+ "")
+
+(define_insn "*vec_set<mode>_1"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v")
+ (vec_merge:VEC_1REG_MODE
+ (vec_duplicate:VEC_1REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand" "SS"))
+ (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+ "U0")
+ (match_operand:SI 2 "const_int_operand" " i")))]
+ "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)"
+ {
+ operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
+ return "v_writelane_b32 %0, %1, %2";
+ }
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")
+ (set_attr "laneselect" "yes")])
+
+(define_insn "*vec_set<mode>_1"
+ [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "=v")
+ (vec_merge:VEC_2REG_MODE
+ (vec_duplicate:VEC_2REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand" "SS"))
+ (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+ "U0")
+ (match_operand:SI 2 "const_int_operand" " i")))]
+ "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)"
+ {
+ operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
+ return "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2";
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")
+ (set_attr "laneselect" "yes")])
+
+(define_insn "vec_duplicate<mode>"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v")
+ (vec_duplicate:VEC_1REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SgB")))]
+ ""
+ "v_mov_b32\t%0, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "exec" "full")
+ (set_attr "length" "8")])
+
+(define_insn "vec_duplicate<mode>"
+ [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "= v")
+ (vec_duplicate:VEC_2REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SgDB")))]
+ ""
+ "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1"
+ [(set_attr "type" "vop3a")
+ (set_attr "exec" "full")
+ (set_attr "length" "16")])
+
+(define_insn "vec_duplicate<mode>_exec"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "= v")
+ (vec_merge:VEC_1REG_MODE
+ (vec_duplicate:VEC_1REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SSB"))
+ (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+ " U0")
+ (match_operand:DI 2 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_mov_b32\t%0, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "vec_duplicate<mode>_exec"
+ [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "= v")
+ (vec_merge:VEC_2REG_MODE
+ (vec_duplicate:VEC_2REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand" "SgDB"))
+ (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+ " U0")
+ (match_operand:DI 2 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1"
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")])
+
+(define_insn "vec_extract<mode><scalar_mode>"
+ [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg")
+ (vec_select:<SCALAR_MODE>
+ (match_operand:VEC_1REG_MODE 1 "register_operand" " v")
+ (parallel [(match_operand:SI 2 "gcn_alu_operand" "SSB")])))]
+ ""
+ "v_readlane_b32 %0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")
+ (set_attr "laneselect" "yes")])
+
+(define_insn "vec_extract<mode><scalar_mode>"
+ [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg")
+ (vec_select:<SCALAR_MODE>
+ (match_operand:VEC_2REG_MODE 1 "register_operand" " v")
+ (parallel [(match_operand:SI 2 "gcn_alu_operand" "SSB")])))]
+ ""
+ "v_readlane_b32 %L0, %L1, %2\;v_readlane_b32 %H0, %H1, %2"
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")
+ (set_attr "laneselect" "yes")])
+
+(define_expand "vec_init<mode><scalar_mode>"
+ [(match_operand:VEC_REG_MODE 0 "register_operand")
+ (match_operand 1)]
+ ""
+ {
+ gcn_expand_vector_init (operands[0], operands[1]);
+ DONE;
+ })
+
+;; }}}
+;; {{{ Scatter / Gather
+
+;; GCN does not have an instruction for loading a vector from contiguous
+;; memory so *all* loads and stores are eventually converted to scatter
+;; or gather.
+;;
+;; GCC does not permit MEM to hold vectors of addresses, so we must use an
+;; unspec. The unspec formats are as follows:
+;;
+;; (unspec:V64??
+;; [(<address expression>)
+;; (<addr_space_t>)
+;; (<use_glc>)
+;; (mem:BLK (scratch))]
+;; UNSPEC_GATHER)
+;;
+;; (unspec:BLK
+;; [(<address expression>)
+;; (<source register>)
+;; (<addr_space_t>)
+;; (<use_glc>)
+;; (<exec>)]
+;; UNSPEC_SCATTER)
+;;
+;; - Loads are expected to be wrapped in a vec_merge, so do not need <exec>.
+;; - The mem:BLK does not contain any real information, but indicates that an
+;; unknown memory read is taking place. Stores are expected to use a similar
+;; mem:BLK outside the unspec.
+;; - The address space and glc (volatile) fields are there to replace the
+;; fields normally found in a MEM.
+;; - Multiple forms of address expression are supported, below.
+
+;; Standard named gather pattern: operand 1 is the scalar base address,
+;; operand 2 the vector of offsets, operand 3 the signed/unsigned flag and
+;; operand 4 the scale.  V64DI offsets are truncated to V64SI first, then
+;; everything is delegated to gather<mode>_exec with a full EXEC mask.
+(define_expand "gather_load<mode>"
+ [(match_operand:VEC_REG_MODE 0 "register_operand")
+ (match_operand:DI 1 "register_operand")
+ (match_operand 2 "register_operand")
+ (match_operand 3 "immediate_operand")
+ (match_operand:SI 4 "gcn_alu_operand")]
+ ""
+ {
+ rtx exec = gcn_full_exec_reg ();
+
+ /* TODO: more conversions will be needed when more types are vectorized. */
+ if (GET_MODE (operands[2]) == V64DImode)
+ {
+ rtx tmp = gen_reg_rtx (V64SImode);
+ emit_insn (gen_vec_truncatev64div64si (tmp, operands[2],
+ gcn_gen_undef (V64SImode),
+ exec));
+ operands[2] = tmp;
+ }
+
+ emit_insn (gen_gather<mode>_exec (operands[0], operands[1], operands[2],
+ operands[3], operands[4], exec));
+ DONE;
+ })
+
+;; Gather with an explicit EXEC mask (operand 5).  The offsets are scaled
+;; with a shift when the scale is a power of two, otherwise a multiply.
+;; For the FLAT address space the base is added to the (zero- or
+;; sign-extended, per operand 3) offsets to form 64-bit addresses; for
+;; GLOBAL the base+offset form of the instruction is used directly.
+(define_expand "gather<mode>_exec"
+ [(match_operand:VEC_REG_MODE 0 "register_operand")
+ (match_operand:DI 1 "register_operand")
+ (match_operand:V64SI 2 "register_operand")
+ (match_operand 3 "immediate_operand")
+ (match_operand:SI 4 "gcn_alu_operand")
+ (match_operand:DI 5 "gcn_exec_reg_operand")]
+ ""
+ {
+ rtx dest = operands[0];
+ rtx base = operands[1];
+ rtx offsets = operands[2];
+ int unsignedp = INTVAL (operands[3]);
+ rtx scale = operands[4];
+ rtx exec = operands[5];
+
+ rtx tmpsi = gen_reg_rtx (V64SImode);
+ rtx tmpdi = gen_reg_rtx (V64DImode);
+ rtx undefsi = gcn_gen_undef (V64SImode);
+ rtx undefdi = gcn_gen_undef (V64DImode);
+ rtx undefmode = gcn_gen_undef (<MODE>mode);
+
+ if (CONST_INT_P (scale)
+ && INTVAL (scale) > 0
+ && exact_log2 (INTVAL (scale)) >= 0)
+ emit_insn (gen_ashlv64si3 (tmpsi, offsets,
+ GEN_INT (exact_log2 (INTVAL (scale)))));
+ else
+ emit_insn (gen_mulv64si3_vector_dup (tmpsi, offsets, scale, exec,
+ undefsi));
+
+ if (DEFAULT_ADDR_SPACE == ADDR_SPACE_FLAT)
+ {
+ if (unsignedp)
+ emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base, exec,
+ undefdi));
+ else
+ emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base, exec,
+ undefdi));
+ emit_insn (gen_gather<mode>_insn_1offset (dest, tmpdi, const0_rtx,
+ const0_rtx, const0_rtx,
+ undefmode, exec));
+ }
+ else if (DEFAULT_ADDR_SPACE == ADDR_SPACE_GLOBAL)
+ emit_insn (gen_gather<mode>_insn_2offsets (dest, base, tmpsi, const0_rtx,
+ const0_rtx, const0_rtx,
+ undefmode, exec));
+ else
+ gcc_unreachable ();
+ DONE;
+ })
+
+; Allow any address expression
+;; Generic gather form matching the UNSPEC_GATHER layout documented above:
+;; operand 1 is an arbitrary address expression, operand 2 the address
+;; space, operand 3 the glc (volatile) flag.  No code is generated here;
+;; the insn patterns below match the specific address shapes.
+(define_expand "gather<mode>_expr"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand")
+ (vec_merge:VEC_REG_MODE
+ (unspec:VEC_REG_MODE
+ [(match_operand 1 "")
+ (match_operand 2 "immediate_operand")
+ (match_operand 3 "immediate_operand")
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER)
+ (match_operand:VEC_REG_MODE 4 "gcn_register_or_unspec_operand")
+ (match_operand:DI 5 "gcn_exec_operand")))]
+ ""
+ {})
+
+;; Gather using a vector of 64-bit addresses (operand 1) plus a constant
+;; byte offset (operand 2).  Operand 3 is the address space, operand 4 the
+;; glc (volatile) flag, operand 6 the EXEC mask.
+;;
+;; The FLAT encoding only gains an instruction offset field on GCN5+; the
+;; output template below omits the offset on older ISAs, so the insn
+;; condition must reject non-zero FLAT offsets unless TARGET_GCN5_PLUS
+;; (previously a non-zero offset could match on GCN3 and be silently
+;; dropped, yielding a wrong address).  This now matches the condition of
+;; scatter<mode>_insn_1offset.
+(define_insn "gather<mode>_insn_1offset"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand" "=v, v")
+ (vec_merge:VEC_REG_MODE
+ (unspec:VEC_REG_MODE
+ [(plus:V64DI (match_operand:V64DI 1 "register_operand" " v, v")
+ (vec_duplicate:V64DI
+ (match_operand 2 "immediate_operand" " n, n")))
+ (match_operand 3 "immediate_operand" " n, n")
+ (match_operand 4 "immediate_operand" " n, n")
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER)
+ (match_operand:VEC_REG_MODE 5 "gcn_register_or_unspec_operand"
+ "U0, U0")
+ (match_operand:DI 6 "gcn_exec_operand" " e,*Kf")))]
+ "(AS_FLAT_P (INTVAL (operands[3]))
+ && (INTVAL(operands[2]) == 0
+ || (TARGET_GCN5_PLUS
+ && (unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x1000)))
+ || (AS_GLOBAL_P (INTVAL (operands[3]))
+ && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
+ {
+ addr_space_t as = INTVAL (operands[3]);
+ const char *glc = INTVAL (operands[4]) ? " glc" : "";
+
+ static char buf[200];
+ if (AS_FLAT_P (as))
+ {
+ if (TARGET_GCN5_PLUS)
+ sprintf (buf, "flat_load%%s0\t%%0, %%1 offset:%%2%s\;s_waitcnt\t0",
+ glc);
+ else
+ sprintf (buf, "flat_load%%s0\t%%0, %%1%s\;s_waitcnt\t0", glc);
+ }
+ else if (AS_GLOBAL_P (as))
+ sprintf (buf, "global_load%%s0\t%%0, %%1, off offset:%%2%s\;"
+ "s_waitcnt\tvmcnt(0)", glc);
+ else
+ gcc_unreachable ();
+
+ return buf;
+ }
+ [(set_attr "type" "flat")
+ (set_attr "length" "12")
+ (set_attr "exec" "*,full")])
+
+;; Gather from LDS/GDS memory: 32-bit addresses (operand 1) plus a
+;; constant offset, which must fit in the 16-bit ds offset field.
+;; The " gds" suffix is appended when the address space is GDS.
+(define_insn "gather<mode>_insn_1offset_ds"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand" "=v, v")
+ (vec_merge:VEC_REG_MODE
+ (unspec:VEC_REG_MODE
+ [(plus:V64SI (match_operand:V64SI 1 "register_operand" " v, v")
+ (vec_duplicate:V64SI
+ (match_operand 2 "immediate_operand" " n, n")))
+ (match_operand 3 "immediate_operand" " n, n")
+ (match_operand 4 "immediate_operand" " n, n")
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER)
+ (match_operand:VEC_REG_MODE 5 "gcn_register_or_unspec_operand"
+ "U0, U0")
+ (match_operand:DI 6 "gcn_exec_operand" " e,*Kf")))]
+ "(AS_ANY_DS_P (INTVAL (operands[3]))
+ && ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x10000))"
+ {
+ addr_space_t as = INTVAL (operands[3]);
+ static char buf[200];
+ sprintf (buf, "ds_read%%b0\t%%0, %%1 offset:%%2%s\;s_waitcnt\tlgkmcnt(0)",
+ (AS_GDS_P (as) ? " gds" : ""));
+ return buf;
+ }
+ [(set_attr "type" "ds")
+ (set_attr "length" "12")
+ (set_attr "exec" "*,full")])
+
+;; Gather using a scalar (SGPR) base address (operand 1) plus sign-extended
+;; 32-bit vector offsets (operand 2) plus a constant offset (operand 3).
+;; Only valid for the GLOBAL address space; the constant must fit the
+;; signed 13-bit instruction offset field.
+(define_insn "gather<mode>_insn_2offsets"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand" "=v")
+ (vec_merge:VEC_REG_MODE
+ (unspec:VEC_REG_MODE
+ [(plus:V64DI
+ (plus:V64DI
+ (vec_duplicate:V64DI
+ (match_operand:DI 1 "register_operand" "SS"))
+ (sign_extend:V64DI
+ (match_operand:V64SI 2 "register_operand" " v")))
+ (vec_duplicate:V64DI (match_operand 3 "immediate_operand"
+ " n")))
+ (match_operand 4 "immediate_operand" " n")
+ (match_operand 5 "immediate_operand" " n")
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER)
+ (match_operand:VEC_REG_MODE 6 "gcn_register_or_unspec_operand"
+ "U0")
+ (match_operand:DI 7 "gcn_exec_operand" " e")))]
+ "(AS_GLOBAL_P (INTVAL (operands[4]))
+ && (((unsigned HOST_WIDE_INT)INTVAL(operands[3]) + 0x1000) < 0x2000))"
+ {
+ addr_space_t as = INTVAL (operands[4]);
+ const char *glc = INTVAL (operands[5]) ? " glc" : "";
+
+ static char buf[200];
+ if (AS_GLOBAL_P (as))
+ {
+ /* Work around assembler bug in which a 64-bit register is expected,
+ but a 32-bit value would be correct. */
+ int reg = REGNO (operands[2]) - FIRST_VGPR_REG;
+ sprintf (buf, "global_load%%s0\t%%0, v[%d:%d], %%1 offset:%%3%s\;"
+ "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc);
+ }
+ else
+ gcc_unreachable ();
+
+ return buf;
+ }
+ [(set_attr "type" "flat")
+ (set_attr "length" "12")])
+
+;; Standard named scatter pattern: operand 0 is the scalar base address,
+;; operand 1 the vector of offsets, operand 2 the signed/unsigned flag,
+;; operand 3 the scale and operand 4 the source vector.  V64DI offsets are
+;; truncated to V64SI, then scatter<mode>_exec does the work under a full
+;; EXEC mask.  Mirrors gather_load<mode>.
+(define_expand "scatter_store<mode>"
+ [(match_operand:DI 0 "register_operand")
+ (match_operand 1 "register_operand")
+ (match_operand 2 "immediate_operand")
+ (match_operand:SI 3 "gcn_alu_operand")
+ (match_operand:VEC_REG_MODE 4 "register_operand")]
+ ""
+ {
+ rtx exec = gcn_full_exec_reg ();
+
+ /* TODO: more conversions will be needed when more types are vectorized. */
+ if (GET_MODE (operands[1]) == V64DImode)
+ {
+ rtx tmp = gen_reg_rtx (V64SImode);
+ emit_insn (gen_vec_truncatev64div64si (tmp, operands[1],
+ gcn_gen_undef (V64SImode),
+ exec));
+ operands[1] = tmp;
+ }
+
+ emit_insn (gen_scatter<mode>_exec (operands[0], operands[1], operands[2],
+ operands[3], operands[4], exec));
+ DONE;
+ })
+
+;; Scatter with an explicit EXEC mask (operand 5); mirrors gather<mode>_exec.
+;; Offsets are scaled by shift (power-of-two scale) or multiply, then the
+;; 1-offset form is used for FLAT (after extending to 64-bit addresses) or
+;; the 2-offsets form for GLOBAL.
+(define_expand "scatter<mode>_exec"
+ [(match_operand:DI 0 "register_operand")
+ (match_operand 1 "register_operand")
+ (match_operand 2 "immediate_operand")
+ (match_operand:SI 3 "gcn_alu_operand")
+ (match_operand:VEC_REG_MODE 4 "register_operand")
+ (match_operand:DI 5 "gcn_exec_reg_operand")]
+ ""
+ {
+ rtx base = operands[0];
+ rtx offsets = operands[1];
+ int unsignedp = INTVAL (operands[2]);
+ rtx scale = operands[3];
+ rtx src = operands[4];
+ rtx exec = operands[5];
+
+ rtx tmpsi = gen_reg_rtx (V64SImode);
+ rtx tmpdi = gen_reg_rtx (V64DImode);
+ rtx undefsi = gcn_gen_undef (V64SImode);
+ rtx undefdi = gcn_gen_undef (V64DImode);
+
+ if (CONST_INT_P (scale)
+ && INTVAL (scale) > 0
+ && exact_log2 (INTVAL (scale)) >= 0)
+ emit_insn (gen_ashlv64si3 (tmpsi, offsets,
+ GEN_INT (exact_log2 (INTVAL (scale)))));
+ else
+ emit_insn (gen_mulv64si3_vector_dup (tmpsi, offsets, scale, exec,
+ undefsi));
+
+ if (DEFAULT_ADDR_SPACE == ADDR_SPACE_FLAT)
+ {
+ if (unsignedp)
+ emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base, exec,
+ undefdi));
+ else
+ emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base, exec,
+ undefdi));
+ emit_insn (gen_scatter<mode>_insn_1offset (tmpdi, const0_rtx, src,
+ const0_rtx, const0_rtx,
+ exec));
+ }
+ else if (DEFAULT_ADDR_SPACE == ADDR_SPACE_GLOBAL)
+ emit_insn (gen_scatter<mode>_insn_2offsets (base, tmpsi, const0_rtx, src,
+ const0_rtx, const0_rtx,
+ exec));
+ else
+ gcc_unreachable ();
+ DONE;
+ })
+
+; Allow any address expression
+;; Generic scatter form matching the UNSPEC_SCATTER layout documented
+;; above: operand 0 is an arbitrary address expression, operand 1 the
+;; source vector, operand 2 the address space, operand 3 the glc flag and
+;; operand 4 the EXEC mask.  No code is generated here.
+(define_expand "scatter<mode>_expr"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:V64DI 0 "")
+ (match_operand:VEC_REG_MODE 1 "register_operand")
+ (match_operand 2 "immediate_operand")
+ (match_operand 3 "immediate_operand")
+ (match_operand:DI 4 "gcn_exec_operand")]
+ UNSPEC_SCATTER))]
+ ""
+ {})
+
+;; Scatter using a vector of 64-bit addresses (operand 0) plus a constant
+;; byte offset (operand 1).  Note the condition: FLAT only supports a
+;; non-zero instruction offset on GCN5+ (matching the output template,
+;; which omits the offset field on older ISAs).
+(define_insn "scatter<mode>_insn_1offset"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(plus:V64DI (match_operand:V64DI 0 "register_operand" "v, v")
+ (vec_duplicate:V64DI
+ (match_operand 1 "immediate_operand" "n, n")))
+ (match_operand:VEC_REG_MODE 2 "register_operand" "v, v")
+ (match_operand 3 "immediate_operand" "n, n")
+ (match_operand 4 "immediate_operand" "n, n")
+ (match_operand:DI 5 "gcn_exec_operand" "e,*Kf")]
+ UNSPEC_SCATTER))]
+ "(AS_FLAT_P (INTVAL (operands[3]))
+ && (INTVAL(operands[1]) == 0
+ || (TARGET_GCN5_PLUS
+ && (unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x1000)))
+ || (AS_GLOBAL_P (INTVAL (operands[3]))
+ && (((unsigned HOST_WIDE_INT)INTVAL(operands[1]) + 0x1000) < 0x2000))"
+ {
+ addr_space_t as = INTVAL (operands[3]);
+ const char *glc = INTVAL (operands[4]) ? " glc" : "";
+
+ static char buf[200];
+ if (AS_FLAT_P (as))
+ {
+ if (TARGET_GCN5_PLUS)
+ sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s\;s_waitcnt\t0",
+ glc);
+ else
+ sprintf (buf, "flat_store%%s2\t%%0, %%2%s\;s_waitcnt\t0", glc);
+ }
+ else if (AS_GLOBAL_P (as))
+ sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s\;"
+ "s_waitcnt\tvmcnt(0)", glc);
+ else
+ gcc_unreachable ();
+
+ return buf;
+ }
+ [(set_attr "type" "flat")
+ (set_attr "length" "12")
+ (set_attr "exec" "*,full")])
+
+;; Scatter to LDS/GDS memory: 32-bit addresses plus a constant offset that
+;; must fit the 16-bit ds offset field.  Mirrors gather<mode>_insn_1offset_ds.
+(define_insn "scatter<mode>_insn_1offset_ds"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(plus:V64SI (match_operand:V64SI 0 "register_operand" "v, v")
+ (vec_duplicate:V64SI
+ (match_operand 1 "immediate_operand" "n, n")))
+ (match_operand:VEC_REG_MODE 2 "register_operand" "v, v")
+ (match_operand 3 "immediate_operand" "n, n")
+ (match_operand 4 "immediate_operand" "n, n")
+ (match_operand:DI 5 "gcn_exec_operand" "e,*Kf")]
+ UNSPEC_SCATTER))]
+ "(AS_ANY_DS_P (INTVAL (operands[3]))
+ && ((unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x10000))"
+ {
+ addr_space_t as = INTVAL (operands[3]);
+ static char buf[200];
+ sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\tlgkmcnt(0)",
+ (AS_GDS_P (as) ? " gds" : ""));
+ return buf;
+ }
+ [(set_attr "type" "ds")
+ (set_attr "length" "12")
+ (set_attr "exec" "*,full")])
+
+;; Scatter using an SGPR base (operand 0) plus sign-extended 32-bit vector
+;; offsets (operand 1) plus a constant offset (operand 2); GLOBAL address
+;; space only.  Mirrors gather<mode>_insn_2offsets.
+(define_insn "scatter<mode>_insn_2offsets"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(plus:V64DI
+ (plus:V64DI
+ (vec_duplicate:V64DI
+ (match_operand:DI 0 "register_operand" "SS"))
+ (sign_extend:V64DI
+ (match_operand:V64SI 1 "register_operand" " v")))
+ (vec_duplicate:V64DI (match_operand 2 "immediate_operand" " n")))
+ (match_operand:VEC_REG_MODE 3 "register_operand" " v")
+ (match_operand 4 "immediate_operand" " n")
+ (match_operand 5 "immediate_operand" " n")
+ (match_operand:DI 6 "gcn_exec_operand" " e")]
+ UNSPEC_SCATTER))]
+ "(AS_GLOBAL_P (INTVAL (operands[4]))
+ && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
+ {
+ addr_space_t as = INTVAL (operands[4]);
+ const char *glc = INTVAL (operands[5]) ? " glc" : "";
+
+ static char buf[200];
+ if (AS_GLOBAL_P (as))
+ {
+ /* Work around assembler bug in which a 64-bit register is expected,
+ but a 32-bit value would be correct. */
+ int reg = REGNO (operands[1]) - FIRST_VGPR_REG;
+ sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s\;"
+ "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc);
+ }
+ else
+ gcc_unreachable ();
+
+ return buf;
+ }
+ [(set_attr "type" "flat")
+ (set_attr "length" "12")])
+
+;; }}}
+;; {{{ Permutations
+
+;; Cross-lane permutation for single-register-wide vectors: each lane
+;; reads the element of operand 2 selected by operand 1, via
+;; ds_bpermute_b32 followed by the required lgkmcnt wait.
+(define_insn "ds_bpermute<mode>"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v")
+ (unspec:VEC_1REG_MODE
+ [(match_operand:VEC_1REG_MODE 2 "register_operand" " v")
+ (match_operand:V64SI 1 "register_operand" " v")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")]
+ UNSPEC_BPERMUTE))]
+ ""
+ "ds_bpermute_b32\t%0, %1, %2\;s_waitcnt\tlgkmcnt(0)"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "12")])
+
+;; Two-register-wide variant: split after reload into two 32-bit
+;; bpermutes, one per register part.  The earlyclobber + "v0" tie keep the
+;; output from overwriting the source before the second half is read.
+(define_insn_and_split "ds_bpermute<mode>"
+ [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "=&v")
+ (unspec:VEC_2REG_MODE
+ [(match_operand:VEC_2REG_MODE 2 "register_operand" " v0")
+ (match_operand:V64SI 1 "register_operand" " v")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")]
+ UNSPEC_BPERMUTE))]
+ ""
+ "#"
+ "reload_completed"
+ [(set (match_dup 4) (unspec:V64SI [(match_dup 6) (match_dup 1) (match_dup 3)]
+ UNSPEC_BPERMUTE))
+ (set (match_dup 5) (unspec:V64SI [(match_dup 7) (match_dup 1) (match_dup 3)]
+ UNSPEC_BPERMUTE))]
+ {
+ operands[4] = gcn_operand_part (<MODE>mode, operands[0], 0);
+ operands[5] = gcn_operand_part (<MODE>mode, operands[0], 1);
+ operands[6] = gcn_operand_part (<MODE>mode, operands[2], 0);
+ operands[7] = gcn_operand_part (<MODE>mode, operands[2], 1);
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "24")])
+
+;; }}}
+;; {{{ ALU special case: add/sub
+
+(define_mode_iterator V64SIDI [V64SI V64DI])
+
+;; Standard named add<mode>3/sub<mode>3 expander for V64SI and V64DI:
+;; wraps the plus/minus in a vec_merge under the full EXEC mask with an
+;; undefined "previous value" operand, and clobbers VCC for the carry.
+(define_expand "<expander><mode>3"
+ [(parallel [(set (match_operand:V64SIDI 0 "register_operand")
+ (vec_merge:V64SIDI
+ (plus_minus:V64SIDI
+ (match_operand:V64SIDI 1 "register_operand")
+ (match_operand:V64SIDI 2 "gcn_alu_operand"))
+ (match_dup 4)
+ (match_dup 3)))
+ (clobber (reg:DI VCC_REG))])]
+ ""
+ {
+ operands[3] = gcn_full_exec_reg ();
+ operands[4] = gcn_gen_undef (<MODE>mode);
+ })
+
+;; Masked 32-bit vector add; carry-out goes to VCC (clobbered here).
+;; Operand 3 is the EXEC mask, operand 4 the value for disabled lanes.
+(define_insn "addv64si3_vector"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (vec_merge:V64SI
+ (plus:V64SI
+ (match_operand:V64SI 1 "register_operand" "% v")
+ (match_operand:V64SI 2 "gcn_alu_operand" "vSSB"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+;; Scalar SI add performed in a VGPR; the EXEC mask is a plain "use"
+;; since the result is a scalar, and VCC is clobbered by the carry-out.
+(define_insn "addsi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= v")
+ (plus:SI
+ (match_operand:SI 1 "register_operand" "% v")
+ (match_operand:SI 2 "gcn_alu_operand" "vSSB")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e"))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+;; Add a broadcast scalar (operand 2) to each lane of a vector.  The
+;; second alternative (*Kf) is a full-EXEC variant, see the "exec" attr.
+(define_insn "addv64si3_vector_dup"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v")
+ (vec_merge:V64SI
+ (plus:V64SI
+ (vec_duplicate:V64SI
+ (match_operand:SI 2 "gcn_alu_operand" "SSB,SSB"))
+ (match_operand:V64SI 1 "register_operand" " v, v"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e,*Kf")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")
+ (set_attr "exec" "*,full")])
+
+;; Vector add that exposes the carry-out in operand 5 (VCC or any SGPR
+;; pair).  The carry is described as unsigned overflow (result ltu addend)
+;; per lane; bits of disabled lanes are preserved from operand 6 via the
+;; ior/and-not expression.
+(define_insn "addv64si3_vector_vcc"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v")
+ (vec_merge:V64SI
+ (plus:V64SI
+ (match_operand:V64SI 1 "register_operand" "% v, v")
+ (match_operand:V64SI 2 "gcn_alu_operand" "vSSB,vSSB"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand"
+ " U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))
+ (set (match_operand:DI 5 "register_operand" "= cV, Sg")
+ (ior:DI (and:DI (ltu:DI (plus:V64SI (match_dup 1) (match_dup 2))
+ (match_dup 1))
+ (match_dup 3))
+ (and:DI (not:DI (match_dup 3))
+ (match_operand:DI 6 "gcn_register_or_unspec_operand"
+ " U5, U5"))))]
+ ""
+ "v_add%^_u32\t%0, %5, %2, %1"
+ [(set_attr "type" "vop2,vop3b")
+ (set_attr "length" "8")])
+
+; This pattern only changes the VCC bits when the corresponding lane is
+; enabled, so the set must be described as an ior.
+
+;; Broadcast-scalar variant of addv64si3_vector_vcc: operand 2 is
+;; duplicated across lanes and the carry-out is written to operand 5.
+(define_insn "addv64si3_vector_vcc_dup"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v")
+ (vec_merge:V64SI
+ (plus:V64SI
+ (vec_duplicate:V64SI (match_operand:SI 2 "gcn_alu_operand"
+ "SSB,SSB"))
+ (match_operand:V64SI 1 "register_operand" " v, v"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" "U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))
+ (set (match_operand:DI 5 "register_operand" "=cV, Sg")
+ (ior:DI (and:DI (ltu:DI (plus:V64SI (vec_duplicate:V64SI (match_dup 2))
+ (match_dup 1))
+ (vec_duplicate:V64SI (match_dup 2)))
+ (match_dup 3))
+ (and:DI (not:DI (match_dup 3))
+ (match_operand:DI 6 "gcn_register_or_unspec_operand"
+ " 5U, 5U"))))]
+ ""
+ "v_add%^_u32\t%0, %5, %2, %1"
+ [(set_attr "type" "vop2,vop3b")
+ (set_attr "length" "8,8")])
+
+; This pattern does not accept SGPR because VCC read already counts as an
+; SGPR use and number of SGPR operands is limited to 1.
+
+;; Add-with-carry-in: the incoming carry (operand 5) is modelled as a
+;; vec_merge selecting 1 or 0 per lane (operands 7/8) added into the sum.
+;; The carry-out (operand 6) is the ior of the two per-step unsigned
+;; overflow tests, merged with operand 9 for disabled lanes.
+(define_insn "addcv64si3_vec"
+ [(set (match_operand:V64SI 0 "register_operand" "=v,v")
+ (vec_merge:V64SI
+ (plus:V64SI
+ (plus:V64SI
+ (vec_merge:V64SI
+ (match_operand:V64SI 7 "gcn_vec1_operand" " A, A")
+ (match_operand:V64SI 8 "gcn_vec0_operand" " A, A")
+ (match_operand:DI 5 "register_operand" " cV,Sg"))
+ (match_operand:V64SI 1 "gcn_alu_operand" "%vA,vA"))
+ (match_operand:V64SI 2 "gcn_alu_operand" " vB,vB"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0,U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))
+ (set (match_operand:DI 6 "register_operand" "=cV,Sg")
+ (ior:DI (and:DI (ior:DI (ltu:DI (plus:V64SI (plus:V64SI
+ (vec_merge:V64SI
+ (match_dup 7)
+ (match_dup 8)
+ (match_dup 5))
+ (match_dup 1))
+ (match_dup 2))
+ (match_dup 2))
+ (ltu:DI (plus:V64SI (vec_merge:V64SI
+ (match_dup 7)
+ (match_dup 8)
+ (match_dup 5))
+ (match_dup 1))
+ (match_dup 1)))
+ (match_dup 3))
+ (and:DI (not:DI (match_dup 3))
+ (match_operand:DI 9 "gcn_register_or_unspec_operand"
+ " 6U,6U"))))]
+ ""
+ "v_addc%^_u32\t%0, %6, %1, %2, %5"
+ [(set_attr "type" "vop2,vop3b")
+ (set_attr "length" "4,8")])
+
+;; Broadcast-scalar variant of addcv64si3_vec: operand 2 is a scalar
+;; duplicated across lanes.  Same carry-in/carry-out structure as above.
+(define_insn "addcv64si3_vec_dup"
+ [(set (match_operand:V64SI 0 "register_operand" "=v,v")
+ (vec_merge:V64SI
+ (plus:V64SI
+ (plus:V64SI
+ (vec_merge:V64SI
+ (match_operand:V64SI 7 "gcn_vec1_operand" " A, A")
+ (match_operand:V64SI 8 "gcn_vec0_operand" " A, A")
+ (match_operand:DI 5 "register_operand" " cV, Sg"))
+ (match_operand:V64SI 1 "gcn_alu_operand" "%vA, vA"))
+ (vec_duplicate:V64SI
+ (match_operand:SI 2 "gcn_alu_operand" "SSB,SSB")))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))
+ (set (match_operand:DI 6 "register_operand" "=cV, Sg")
+ (ior:DI (and:DI (ior:DI (ltu:DI (plus:V64SI (plus:V64SI
+ (vec_merge:V64SI
+ (match_dup 7)
+ (match_dup 8)
+ (match_dup 5))
+ (match_dup 1))
+ (vec_duplicate:V64SI
+ (match_dup 2)))
+ (vec_duplicate:V64SI
+ (match_dup 2)))
+ (ltu:DI (plus:V64SI (vec_merge:V64SI
+ (match_dup 7)
+ (match_dup 8)
+ (match_dup 5))
+ (match_dup 1))
+ (match_dup 1)))
+ (match_dup 3))
+ (and:DI (not:DI (match_dup 3))
+ (match_operand:DI 9 "gcn_register_or_unspec_operand"
+ " 6U,6U"))))]
+ ""
+ "v_addc%^_u32\t%0, %6, %1, %2, %5"
+ [(set_attr "type" "vop2,vop3b")
+ (set_attr "length" "4,8")])
+
+;; Masked 32-bit vector subtract.  Two alternatives select v_sub or
+;; v_subrev depending on which operand is the immediate/SGPR; the
+;; condition requires at least one operand in a register.
+(define_insn "subv64si3_vector"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v")
+ (vec_merge:V64SI
+ (minus:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" "vSSB, v")
+ (match_operand:V64SI 2 "gcn_alu_operand" " v,vSSB"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))
+ (clobber (reg:DI VCC_REG))]
+ "register_operand (operands[1], VOIDmode)
+ || register_operand (operands[2], VOIDmode)"
+ "@
+ v_sub%^_u32\t%0, vcc, %1, %2
+ v_subrev%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8,8")])
+
+;; Scalar SI subtract in a VGPR; EXEC is a plain "use" and VCC is
+;; clobbered.  Mirrors addsi3_scalar, with a subrev alternative.
+(define_insn "subsi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= v, v")
+ (minus:SI
+ (match_operand:SI 1 "gcn_alu_operand" "vSSB, v")
+ (match_operand:SI 2 "gcn_alu_operand" " v,vSSB")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e, e"))
+ (clobber (reg:DI VCC_REG))]
+ "register_operand (operands[1], VOIDmode)
+ || register_operand (operands[2], VOIDmode)"
+ "@
+ v_sub%^_u32\t%0, vcc, %1, %2
+ v_subrev%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8,8")])
+
+;; Vector subtract exposing the borrow-out in operand 5 (VCC or SGPR
+;; pair); borrow is described as (a - b) gtu a per lane.  Four
+;; alternatives: {sub, subrev} x {VCC, SGPR destination}.
+(define_insn "subv64si3_vector_vcc"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v, v, v")
+ (vec_merge:V64SI
+ (minus:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" "vSSB,vSSB, v, v")
+ (match_operand:V64SI 2 "gcn_alu_operand" " v, v,vSSB,vSSB"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand"
+ " U0, U0, U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e, e, e")))
+ (set (match_operand:DI 5 "register_operand" "= cV, Sg, cV, Sg")
+ (ior:DI (and:DI (gtu:DI (minus:V64SI (match_dup 1)
+ (match_dup 2))
+ (match_dup 1))
+ (match_dup 3))
+ (and:DI (not:DI (match_dup 3))
+ (match_operand:DI 6 "gcn_register_or_unspec_operand"
+ " 5U, 5U, 5U, 5U"))))]
+ "register_operand (operands[1], VOIDmode)
+ || register_operand (operands[2], VOIDmode)"
+ "@
+ v_sub%^_u32\t%0, %5, %1, %2
+ v_sub%^_u32\t%0, %5, %1, %2
+ v_subrev%^_u32\t%0, %5, %2, %1
+ v_subrev%^_u32\t%0, %5, %2, %1"
+ [(set_attr "type" "vop2,vop3b,vop2,vop3b")
+ (set_attr "length" "8")])
+
+; This pattern does not accept SGPR because VCC read already counts
+; as a SGPR use and number of SGPR operands is limited to 1.
+
+;; Subtract-with-borrow-in: the incoming borrow (operand 5) is modelled
+;; as a vec_merge selecting 1 or 0 per lane (operands 7/8) subtracted
+;; with operand 1; the borrow-out (operand 6) is the ior of the two
+;; per-step overflow tests, merged with operand 9 for disabled lanes.
+;; NOTE(review): the second overflow test uses ltu against operand 1
+;; where the first uses gtu against operand 2 — presumably intentional
+;; for the carry-in/minus ordering; confirm against the ISA definition
+;; of v_subb borrow-out.
+(define_insn "subcv64si3_vec"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v, v, v")
+ (vec_merge:V64SI
+ (minus:V64SI
+ (minus:V64SI
+ (vec_merge:V64SI
+ (match_operand:V64SI 7 "gcn_vec1_operand" " A, A, A, A")
+ (match_operand:V64SI 8 "gcn_vec0_operand" " A, A, A, A")
+ (match_operand:DI 5 "gcn_alu_operand" " cV,Sg,cV,Sg"))
+ (match_operand:V64SI 1 "gcn_alu_operand" " vA,vA,vB,vB"))
+ (match_operand:V64SI 2 "gcn_alu_operand" " vB,vB,vA,vA"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand"
+ " U0,U0,U0,U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e, e, e")))
+ (set (match_operand:DI 6 "register_operand" "=cV,Sg,cV,Sg")
+ (ior:DI (and:DI (ior:DI (gtu:DI (minus:V64SI (minus:V64SI
+ (vec_merge:V64SI
+ (match_dup 7)
+ (match_dup 8)
+ (match_dup 5))
+ (match_dup 1))
+ (match_dup 2))
+ (match_dup 2))
+ (ltu:DI (minus:V64SI (vec_merge:V64SI
+ (match_dup 7)
+ (match_dup 8)
+ (match_dup 5))
+ (match_dup 1))
+ (match_dup 1)))
+ (match_dup 3))
+ (and:DI (not:DI (match_dup 3))
+ (match_operand:DI 9 "gcn_register_or_unspec_operand"
+ " 6U,6U,6U,6U"))))]
+ "register_operand (operands[1], VOIDmode)
+ || register_operand (operands[2], VOIDmode)"
+ "@
+ v_subb%^_u32\t%0, %6, %1, %2, %5
+ v_subb%^_u32\t%0, %6, %1, %2, %5
+ v_subbrev%^_u32\t%0, %6, %2, %1, %5
+ v_subbrev%^_u32\t%0, %6, %2, %1, %5"
+ [(set_attr "type" "vop2,vop3b,vop2,vop3b")
+ (set_attr "length" "8")])
+
+;; 64-bit vector add, split after RA into a 32-bit add producing VCC
+;; followed by an add-with-carry on the high parts.  The earlyclobber and
+;; "0"-tied constraints keep the halves from clobbering each other.
+(define_insn_and_split "addv64di3_vector"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (match_operand:V64DI 1 "register_operand" "% v0")
+ (match_operand:V64DI 2 "gcn_alu_operand" "vSSB0"))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[1])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[4])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vector_vcc
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (V64DImode, operands[1], 0),
+ gcn_operand_part (V64DImode, operands[2], 0),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 0),
+ vcc, gcn_gen_undef (DImode)));
+ emit_insn (gen_addcv64si3_vec
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[1], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 1),
+ vcc, vcc, gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0),
+ gcn_gen_undef (DImode)));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+;; 64-bit vector subtract, split after RA into a 32-bit subtract that
+;; produces the borrow in VCC, then a subtract-with-borrow on the high
+;; parts.  Mirrors addv64di3_vector.
+(define_insn_and_split "subv64di3_vector"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v, &v")
+ (vec_merge:V64DI
+ (minus:V64DI
+ (match_operand:V64DI 1 "gcn_alu_operand" "vSSB0, v0")
+ (match_operand:V64DI 2 "gcn_alu_operand" " v0,vSSB0"))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand"
+ " U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))
+ (clobber (reg:DI VCC_REG))]
+ "register_operand (operands[1], VOIDmode)
+ || register_operand (operands[2], VOIDmode)"
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[1])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[4])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_subv64si3_vector_vcc
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (V64DImode, operands[1], 0),
+ gcn_operand_part (V64DImode, operands[2], 0),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 0),
+ vcc, gcn_gen_undef (DImode)));
+ emit_insn (gen_subcv64si3_vec
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[1], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 1),
+ vcc, vcc, gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0),
+ gcn_gen_undef (DImode)));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8,8")])
+
+;; 64-bit vector add of a broadcast 64-bit scalar (operand 2); split into
+;; the _vcc_dup low-part add and a _vec_dup add-with-carry on the highs.
+(define_insn_and_split "addv64di3_vector_dup"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (match_operand:V64DI 1 "register_operand" " v0")
+ (vec_duplicate:V64DI
+ (match_operand:DI 2 "gcn_alu_operand" "SSDB")))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[1])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[4])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vector_vcc_dup
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (V64DImode, operands[1], 0),
+ gcn_operand_part (DImode, operands[2], 0),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 0),
+ vcc, gcn_gen_undef (DImode)));
+ emit_insn (gen_addcv64si3_vec_dup
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[1], 1),
+ gcn_operand_part (DImode, operands[2], 1),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 1),
+ vcc, vcc, gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0),
+ gcn_gen_undef (DImode)));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+;; Add a zero-extended V64SI (operand 1) to a V64DI (operand 2): the high
+;; part of the extended operand is zero, so the split uses const0_rtx as
+;; the second addend of the high-part add-with-carry.
+(define_insn_and_split "addv64di3_zext"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v,&v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (zero_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" "0vA,0vB"))
+ (match_operand:V64DI 2 "gcn_alu_operand" "0vB,0vA"))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[4])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vector_vcc
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ operands[1],
+ gcn_operand_part (V64DImode, operands[2], 0),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 0),
+ vcc, gcn_gen_undef (DImode)));
+ emit_insn (gen_addcv64si3_vec
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ const0_rtx,
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 1),
+ vcc, vcc, gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0),
+ gcn_gen_undef (DImode)));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8,8")])
+
+;; Add a zero-extended broadcast SI scalar (operand 1) to a V64DI
+;; (operand 2); the high half of the scalar is zero, so only a carry
+;; propagates into the high-part add.
+(define_insn_and_split "addv64di3_zext_dup"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (zero_extend:V64DI
+ (vec_duplicate:V64SI
+ (match_operand:SI 1 "gcn_alu_operand" "BSS")))
+ (match_operand:V64DI 2 "gcn_alu_operand" "vA0"))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[4])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vector_vcc_dup
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[1], 0),
+ gcn_operand_part (V64DImode, operands[2], 0),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 0),
+ vcc, gcn_gen_undef (DImode)));
+ emit_insn (gen_addcv64si3_vec
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ const0_rtx, operands[3],
+ gcn_operand_part (V64DImode, operands[4], 1),
+ vcc, vcc, gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0),
+ gcn_gen_undef (DImode)));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+;; Add a zero-extended V64SI (operand 1) to a broadcast 64-bit scalar
+;; (operand 2).  The split broadcasts the scalar's high part into the
+;; destination high half first, then adds the carry into it in place.
+;; Used by the gather/scatter expanders to form 64-bit addresses.
+(define_insn_and_split "addv64di3_zext_dup2"
+ [(set (match_operand:V64DI 0 "register_operand" "= v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (zero_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand"
+ " vA"))
+ (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSS")))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[4])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vector_vcc_dup
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ operands[1],
+ gcn_operand_part (DImode, operands[2], 0),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 0),
+ vcc, gcn_gen_undef (DImode)));
+ rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+ emit_insn (gen_vec_duplicatev64si_exec
+ (dsthi, gcn_operand_part (DImode, operands[2], 1),
+ operands[3], gcn_gen_undef (V64SImode)));
+ emit_insn (gen_addcv64si3_vec
+ (dsthi, dsthi, const0_rtx, operands[3],
+ gcn_operand_part (V64DImode, operands[4], 1),
+ vcc, vcc, gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0),
+ gcn_gen_undef (DImode)));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+;; Sign-extending variant of addv64di3_zext_dup2: the extended operand's
+;; high part is materialised by an arithmetic shift right by 31 into a
+;; scratch (operand 5), which becomes the second addend of the high-part
+;; add-with-carry.  Used for signed gather/scatter offsets.
+(define_insn_and_split "addv64di3_sext_dup2"
+ [(set (match_operand:V64DI 0 "register_operand" "= v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (sign_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand"
+ " vA"))
+ (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSS")))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))
+ (clobber (match_scratch:V64SI 5 "=&v"))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[4])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_ashrv64si3_vector (operands[5], operands[1], GEN_INT (31),
+ operands[3], gcn_gen_undef (V64SImode)));
+ emit_insn (gen_addv64si3_vector_vcc_dup
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ operands[1],
+ gcn_operand_part (DImode, operands[2], 0),
+ operands[3],
+ gcn_operand_part (V64DImode, operands[4], 0),
+ vcc, gcn_gen_undef (DImode)));
+ rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+ emit_insn (gen_vec_duplicatev64si_exec
+ (dsthi, gcn_operand_part (DImode, operands[2], 1),
+ operands[3], gcn_gen_undef (V64SImode)));
+ emit_insn (gen_addcv64si3_vec
+ (dsthi, dsthi, operands[5], operands[3],
+ gcn_operand_part (V64DImode, operands[4], 1),
+ vcc, vcc, gcn_vec_constant (V64SImode, 1),
+ gcn_vec_constant (V64SImode, 0),
+ gcn_gen_undef (DImode)));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+;; Add a zero-extended scalar SI value (operand 2), duplicated across all
+;; lanes, to a V64DI vector.  Runs with a full exec mask (exec attr "full").
+;; The template pairs a low-word add (carry-out in vcc) with a carry-in
+;; add of 0 for the high words.
+;; NOTE(review): '%^' looks like an ISA-revision-dependent suffix selector
+;; in the output template — confirm against the port's print_operand.
(define_insn "addv64di3_scalarsi"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v, v")
+ (plus:V64DI (vec_duplicate:V64DI
+ (zero_extend:DI
+ (match_operand:SI 2 "register_operand" " Sg,Sg")))
+ (match_operand:V64DI 1 "register_operand" " v, 0")))]
+ ""
+ "v_add%^_u32\t%L0, vcc, %2, %L1\;v_addc%^_u32\t%H0, vcc, 0, %H1, vcc"
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")
+ (set_attr "exec" "full")])
+
+;; }}}
+;; {{{ DS memory ALU: add/sub
+
+;; Modes usable with the DS (LDS/local data share) read-modify-write
+;; arithmetic patterns below: vector forms and their scalar counterparts.
(define_mode_iterator DS_ARITH_MODE [V64SI V64SF V64DI])
+(define_mode_iterator DS_ARITH_SCALAR_MODE [SI SF DI])
+
+;; FIXME: the vector patterns probably need RD expanded to a vector of
+;; addresses. For now, the only way a vector can get into LDS is
+;; if the user puts it there manually.
+;;
+;; FIXME: the scalar patterns are probably fine in themselves, but need to be
+;; checked to see if anything can ever use them.
+
+;; In-place DS add: operands 0 and 1 must name the same memory location
+;; (enforced by the rtx_equal_p condition), matching the ds_add
+;; read-modify-write semantics.  Exec mask is operand 3.
+(define_insn "add<mode>3_ds_vector"
+ [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (vec_merge:DS_ARITH_MODE
+ (plus:DS_ARITH_MODE
+ (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" "%RD")
+ (match_operand:DS_ARITH_MODE 2 "register_operand" " v"))
+ (match_operand:DS_ARITH_MODE 4 "gcn_register_ds_or_unspec_operand"
+ " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_add%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+;; Scalar variant of the in-place DS add; the exec mask is a plain (use).
+(define_insn "add<mode>3_ds_scalar"
+ [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (plus:DS_ARITH_SCALAR_MODE
+ (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+ "%RD")
+ (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand" " v")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e"))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_add%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+;; In-place DS subtract: mem = mem - reg.
+(define_insn "sub<mode>3_ds_vector"
+ [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (vec_merge:DS_ARITH_MODE
+ (minus:DS_ARITH_MODE
+ (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" " RD")
+ (match_operand:DS_ARITH_MODE 2 "register_operand" " v"))
+ (match_operand:DS_ARITH_MODE 4 "gcn_register_ds_or_unspec_operand"
+ " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_sub%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+(define_insn "sub<mode>3_ds_scalar"
+ [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (minus:DS_ARITH_SCALAR_MODE
+ (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+ " RD")
+ (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand" " v")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e"))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_sub%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+;; Reverse subtract (ds_rsub): mem = reg - mem.  Note the minus operands
+;; are swapped relative to sub<mode>3_ds_vector.
+(define_insn "subr<mode>3_ds_vector"
+ [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (vec_merge:DS_ARITH_MODE
+ (minus:DS_ARITH_MODE
+ (match_operand:DS_ARITH_MODE 2 "register_operand" " v")
+ (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" " RD"))
+ (match_operand:DS_ARITH_MODE 4 "gcn_register_ds_or_unspec_operand"
+ " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_rsub%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+(define_insn "subr<mode>3_ds_scalar"
+ [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (minus:DS_ARITH_SCALAR_MODE
+ (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand" " v")
+ (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+ " RD")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e"))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_rsub%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU special case: mult
+
+;; Iterators/attributes mapping sign vs. zero extension onto the name
+;; fragments and operand-print modifiers used by the multiply patterns.
(define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr sgnsuffix [(sign_extend "%i") (zero_extend "%u")])
+(define_code_attr su [(sign_extend "s") (zero_extend "u")])
+(define_code_attr u [(sign_extend "") (zero_extend "u")])
+(define_code_attr iu [(sign_extend "i") (zero_extend "u")])
+(define_code_attr e [(sign_extend "e") (zero_extend "")])
+
+;; Expand {s,u}mulsi3_highpart: attach the scalar exec reg as operand 3.
+;; A constant operand 2 is routed to the const_* pattern, which accepts
+;; an inline-constant ("A") second operand.
+(define_expand "<su>mulsi3_highpart"
+ [(parallel [(set (match_operand:SI 0 "register_operand")
+ (truncate:SI
+ (lshiftrt:DI
+ (mult:DI
+ (any_extend:DI
+ (match_operand:SI 1 "register_operand"))
+ (any_extend:DI
+ (match_operand:SI 2 "gcn_vop3_operand")))
+ (const_int 32))))
+ (use (match_dup 3))])]
+ ""
+ {
+ operands[3] = gcn_scalar_exec_reg ();
+
+ if (CONST_INT_P (operands[2]))
+ {
+ emit_insn (gen_const_<su>mulsi3_highpart_scalar (operands[0],
+ operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+ }
+ })
+
+;; Vector high-part multiply: the upper 32 bits of a widened 32x32
+;; multiply, per lane, under the exec mask.
+(define_insn "<su>mulv64si3_highpart_vector"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (vec_merge:V64SI
+ (truncate:V64SI
+ (lshiftrt:V64DI
+ (mult:V64DI
+ (any_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" " %v"))
+ (any_extend:V64DI
+ (match_operand:V64SI 2 "gcn_alu_operand" "vSSB")))
+ (const_int 32)))
+ (match_operand:V64SI 4 "gcn_register_ds_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; Scalar high-part multiply; exec mask carried as a (use).
+(define_insn "<su>mulsi3_highpart_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= v")
+ (truncate:SI
+ (lshiftrt:DI
+ (mult:DI
+ (any_extend:DI
+ (match_operand:SI 1 "register_operand" "% v"))
+ (any_extend:DI
+ (match_operand:SI 2 "register_operand" "vSS")))
+ (const_int 32))))
+ (use (match_operand:DI 3 "gcn_exec_reg_operand" " e"))]
+ ""
+ "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; High-part multiply by a constant: operand 2 is not wrapped in
+;; any_extend, so an inline constant ("A") may be used directly.
+(define_insn "const_<su>mulsi3_highpart_scalar"
+ [(set (match_operand:SI 0 "register_operand" "=v")
+ (truncate:SI
+ (lshiftrt:DI
+ (mult:DI
+ (any_extend:DI
+ (match_operand:SI 1 "register_operand" "%v"))
+ (match_operand:SI 2 "gcn_vop3_operand" " A"))
+ (const_int 32))))
+ (use (match_operand:DI 3 "gcn_exec_reg_operand" " e"))]
+ ""
+ "v_mul_hi<sgnsuffix>0\t%0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; Widening HI x HI -> SI multiply; the expander only supplies the
+;; scalar exec register.
+(define_expand "<u>mulhisi3"
+ [(parallel [(set (match_operand:SI 0 "register_operand")
+ (mult:SI
+ (any_extend:SI (match_operand:HI 1 "register_operand"))
+ (any_extend:SI (match_operand:HI 2 "register_operand"))))
+ (use (match_dup 3))])]
+ ""
+ {
+ operands[3] = gcn_scalar_exec_reg ();
+ })
+
+;; Implemented via the 24-bit multiply with SDWA sub-word selection of
+;; the low 16 bits of each source.
+(define_insn "<u>mulhisi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "=v")
+ (mult:SI
+ (any_extend:SI (match_operand:HI 1 "register_operand" "%v"))
+ (any_extend:SI (match_operand:HI 2 "register_operand" " v"))))
+ (use (match_operand:DI 3 "gcn_exec_reg_operand" " e"))]
+ ""
+ "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:WORD_0 src1_sel:WORD_0"
+ [(set_attr "type" "vop_sdwa")
+ (set_attr "length" "8")])
+
+;; Widening QI x QI -> HI multiply, analogous to <u>mulhisi3 above.
+(define_expand "<u>mulqihi3"
+ [(parallel [(set (match_operand:HI 0 "register_operand")
+ (mult:HI
+ (any_extend:HI (match_operand:QI 1 "register_operand"))
+ (any_extend:HI (match_operand:QI 2 "register_operand"))))
+ (use (match_dup 3))])]
+ ""
+ {
+ operands[3] = gcn_scalar_exec_reg ();
+ })
+
+;; As above, but selecting the low byte of each source.
+(define_insn "<u>mulqihi3_scalar"
+ [(set (match_operand:HI 0 "register_operand" "=v")
+ (mult:HI
+ (any_extend:HI (match_operand:QI 1 "register_operand" "%v"))
+ (any_extend:HI (match_operand:QI 2 "register_operand" " v"))))
+ (use (match_operand:DI 3 "gcn_exec_reg_operand" " e"))]
+ ""
+ "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:BYTE_0 src1_sel:BYTE_0"
+ [(set_attr "type" "vop_sdwa")
+ (set_attr "length" "8")])
+
+;; Expand low-part V64SI multiply: supply the full exec mask and an
+;; undefined previous-value operand for the vec_merge.
(define_expand "mulv64si3"
+ [(set (match_operand:V64SI 0 "register_operand")
+ (vec_merge:V64SI
+ (mult:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand")
+ (match_operand:V64SI 2 "gcn_alu_operand"))
+ (match_dup 4)
+ (match_dup 3)))]
+ ""
+ {
+ operands[3] = gcn_full_exec_reg ();
+ operands[4] = gcn_gen_undef (V64SImode);
+ })
+
+;; Per-lane 32-bit low-part multiply under the exec mask.
+(define_insn "mulv64si3_vector"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (vec_merge:V64SI
+ (mult:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" "%vSvA")
+ (match_operand:V64SI 2 "gcn_alu_operand" " vSvA"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_mul_lo_u32\t%0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; As above with the second source broadcast from a scalar SI value.
+(define_insn "mulv64si3_vector_dup"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (vec_merge:V64SI
+ (mult:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" "%vSvA")
+ (vec_duplicate:V64SI
+ (match_operand:SI 2 "gcn_alu_operand" " SvA")))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_mul_lo_u32\t%0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; 64-bit vector multiply entry point; defers to the split pattern with
+;; full exec and an undefined merge source.
+(define_expand "mulv64di3"
+ [(match_operand:V64DI 0 "register_operand")
+ (match_operand:V64DI 1 "gcn_alu_operand")
+ (match_operand:V64DI 2 "gcn_alu_operand")]
+ ""
+ {
+ emit_insn (gen_mulv64di3_vector (operands[0], operands[1], operands[2],
+ gcn_full_exec_reg (),
+ gcn_gen_undef (V64DImode)));
+ DONE;
+ })
+
+;; Full 64x64 multiply, split after reload into 32-bit pieces using the
+;; schoolbook decomposition:
+;; lo = lo1*lo2 (low part)
+;; hi = umulh(lo1,lo2) + hi1*lo2 + lo1*hi2 + hi1*hi2
+;; The early-clobber destination keeps it disjoint from the sources.
+(define_insn_and_split "mulv64di3_vector"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (vec_merge:V64DI
+ (mult:V64DI
+ (match_operand:V64DI 1 "gcn_alu_operand" "% v")
+ (match_operand:V64DI 2 "gcn_alu_operand" "vDA"))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))
+ (clobber (match_scratch:V64SI 5 "=&v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+ rtx left_lo = gcn_operand_part (V64DImode, operands[1], 0);
+ rtx left_hi = gcn_operand_part (V64DImode, operands[1], 1);
+ rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+ rtx exec = operands[3];
+ rtx tmp = operands[5];
+
+ /* An UNSPEC merge source means "don't care"; use undef halves.  */
+ rtx old_lo, old_hi;
+ if (GET_CODE (operands[4]) == UNSPEC)
+ {
+ old_lo = old_hi = gcn_gen_undef (V64SImode);
+ }
+ else
+ {
+ old_lo = gcn_operand_part (V64DImode, operands[4], 0);
+ old_hi = gcn_operand_part (V64DImode, operands[4], 1);
+ }
+
+ rtx undef = gcn_gen_undef (V64SImode);
+
+ emit_insn (gen_mulv64si3_vector (out_lo, left_lo, right_lo, exec, old_lo));
+ emit_insn (gen_umulv64si3_highpart_vector (out_hi, left_lo, right_lo,
+ exec, old_hi));
+ emit_insn (gen_mulv64si3_vector (tmp, left_hi, right_lo, exec, undef));
+ emit_insn (gen_addv64si3_vector (out_hi, out_hi, tmp, exec, out_hi));
+ emit_insn (gen_mulv64si3_vector (tmp, left_lo, right_hi, exec, undef));
+ emit_insn (gen_addv64si3_vector (out_hi, out_hi, tmp, exec, out_hi));
+ emit_insn (gen_mulv64si3_vector (tmp, left_hi, right_hi, exec, undef));
+ emit_insn (gen_addv64si3_vector (out_hi, out_hi, tmp, exec, out_hi));
+ DONE;
+ })
+
+;; 64-bit multiply where the first source is a zero-extended V64SI: its
+;; high half is zero, so the hi1*lo2 and hi1*hi2 terms drop out.
+(define_insn_and_split "mulv64di3_vector_zext"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (vec_merge:V64DI
+ (mult:V64DI
+ (zero_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v"))
+ (match_operand:V64DI 2 "gcn_alu_operand" "vDA"))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))
+ (clobber (match_scratch:V64SI 5 "=&v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+ rtx left = operands[1];
+ rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+ rtx exec = operands[3];
+ rtx tmp = operands[5];
+
+ rtx old_lo, old_hi;
+ if (GET_CODE (operands[4]) == UNSPEC)
+ {
+ old_lo = old_hi = gcn_gen_undef (V64SImode);
+ }
+ else
+ {
+ old_lo = gcn_operand_part (V64DImode, operands[4], 0);
+ old_hi = gcn_operand_part (V64DImode, operands[4], 1);
+ }
+
+ rtx undef = gcn_gen_undef (V64SImode);
+
+ emit_insn (gen_mulv64si3_vector (out_lo, left, right_lo, exec, old_lo));
+ emit_insn (gen_umulv64si3_highpart_vector (out_hi, left, right_lo,
+ exec, old_hi));
+ emit_insn (gen_mulv64si3_vector (tmp, left, right_hi, exec, undef));
+ emit_insn (gen_addv64si3_vector (out_hi, out_hi, tmp, exec, out_hi));
+ DONE;
+ })
+
+;; As mulv64di3_vector_zext, but the second source is a scalar DI value
+;; broadcast across lanes; the split is otherwise identical.
+(define_insn_and_split "mulv64di3_vector_zext_dup2"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v")
+ (vec_merge:V64DI
+ (mult:V64DI
+ (zero_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v"))
+ (vec_duplicate:V64DI
+ (match_operand:DI 2 "gcn_alu_operand" "SSDA")))
+ (match_operand:V64DI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))
+ (clobber (match_scratch:V64SI 5 "= &v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+ rtx left = operands[1];
+ rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+ rtx exec = operands[3];
+ rtx tmp = operands[5];
+
+ rtx old_lo, old_hi;
+ if (GET_CODE (operands[4]) == UNSPEC)
+ {
+ old_lo = old_hi = gcn_gen_undef (V64SImode);
+ }
+ else
+ {
+ old_lo = gcn_operand_part (V64DImode, operands[4], 0);
+ old_hi = gcn_operand_part (V64DImode, operands[4], 1);
+ }
+
+ rtx undef = gcn_gen_undef (V64SImode);
+
+ emit_insn (gen_mulv64si3_vector (out_lo, left, right_lo, exec, old_lo));
+ emit_insn (gen_umulv64si3_highpart_vector (out_hi, left, right_lo,
+ exec, old_hi));
+ emit_insn (gen_mulv64si3_vector (tmp, left, right_hi, exec, undef));
+ emit_insn (gen_addv64si3_vector (out_hi, out_hi, tmp, exec, out_hi));
+ DONE;
+ })
+
+;; }}}
+;; {{{ ALU generic case
+
+;; Iterators for the generic integer ALU patterns: vector integer modes
+;; and the operation classes handled below.  The <expander>/<mnemonic>/
+;; <revmnemonic> attributes are presumably defined elsewhere in the file
+;; for these code iterators — confirm when reviewing the full file.
(define_mode_iterator VEC_INT_MODE [V64QI V64HI V64SI V64DI])
+
+(define_code_iterator bitop [and ior xor])
+(define_code_iterator bitunop [not popcount])
+(define_code_iterator shiftop [ashift lshiftrt ashiftrt])
+(define_code_iterator minmaxop [smin smax umin umax])
+
+;; Expand and/ior/xor over all vector integer modes: full exec mask,
+;; undefined merge source.
+(define_expand "<expander><mode>3"
+ [(set (match_operand:VEC_INT_MODE 0 "gcn_valu_dst_operand")
+ (vec_merge:VEC_INT_MODE
+ (bitop:VEC_INT_MODE
+ (match_operand:VEC_INT_MODE 1 "gcn_valu_src0_operand")
+ (match_operand:VEC_INT_MODE 2 "gcn_valu_src1com_operand"))
+ (match_dup 4)
+ (match_dup 3)))]
+ ""
+ {
+ operands[3] = gcn_full_exec_reg ();
+ operands[4] = gcn_gen_undef (<MODE>mode);
+ })
+
+;; Vector shift by a single scalar SI amount (all lanes shift equally).
+(define_expand "<expander>v64si3"
+ [(set (match_operand:V64SI 0 "register_operand")
+ (vec_merge:V64SI
+ (shiftop:V64SI
+ (match_operand:V64SI 1 "register_operand")
+ (match_operand:SI 2 "gcn_alu_operand"))
+ (match_dup 4)
+ (match_dup 3)))]
+ ""
+ {
+ operands[3] = gcn_full_exec_reg ();
+ operands[4] = gcn_gen_undef (V64SImode);
+ })
+
+;; Vector shift with a per-lane V64SI shift amount (the "v" prefix names).
+(define_expand "v<expander>v64si3"
+ [(set (match_operand:V64SI 0 "register_operand")
+ (vec_merge:V64SI
+ (shiftop:V64SI
+ (match_operand:V64SI 1 "register_operand")
+ (match_operand:V64SI 2 "gcn_alu_operand"))
+ (match_dup 4)
+ (match_dup 3)))]
+ ""
+ {
+ operands[3] = gcn_full_exec_reg ();
+ operands[4] = gcn_gen_undef (V64SImode);
+ })
+
+;; Expand min/max for single-register vector integer modes; V64QI is
+;; explicitly excluded by the condition.
+(define_expand "<expander><mode>3"
+ [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand")
+ (vec_merge:VEC_1REG_INT_MODE
+ (minmaxop:VEC_1REG_INT_MODE
+ (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand")
+ (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1_operand"))
+ (match_dup 4)
+ (match_dup 3)))]
+ "<MODE>mode != V64QImode"
+ {
+ operands[3] = gcn_full_exec_reg ();
+ operands[4] = gcn_gen_undef (<MODE>mode);
+ })
+
+;; Unary bit operations (not/popcount) on vectors, under the exec mask.
+(define_insn "<expander><mode>2_vector"
+ [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v")
+ (vec_merge:VEC_1REG_INT_MODE
+ (bitunop:VEC_1REG_INT_MODE
+ (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+ "vSSB"))
+ (match_operand:VEC_1REG_INT_MODE 3 "gcn_register_or_unspec_operand"
+ " U0")
+ (match_operand:DI 2 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_<mnemonic>0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+;; Binary bit operations; alternative 2 targets DS memory in place, so
+;; the condition requires dest==src1 and a register src2 for that case.
+(define_insn "<expander><mode>3_vector"
+ [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v,RD")
+ (vec_merge:VEC_1REG_INT_MODE
+ (bitop:VEC_1REG_INT_MODE
+ (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+ "% v, 0")
+ (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+ "vSSB, v"))
+ (match_operand:VEC_1REG_INT_MODE 4
+ "gcn_register_ds_or_unspec_operand" " U0,U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))]
+ "!memory_operand (operands[0], VOIDmode)
+ || (rtx_equal_p (operands[0], operands[1])
+ && register_operand (operands[2], VOIDmode))"
+ "@
+ v_<mnemonic>0\t%0, %2, %1
+ ds_<mnemonic>0\t%A0, %2%O0"
+ [(set_attr "type" "vop2,ds")
+ (set_attr "length" "8,8")])
+
+;; Scalar unary bit operation; exec mask carried as a (use).
+(define_insn "<expander><mode>2_vscalar"
+ [(set (match_operand:SCALAR_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v")
+ (bitunop:SCALAR_1REG_INT_MODE
+ (match_operand:SCALAR_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+ "vSSB")))
+ (use (match_operand:DI 2 "gcn_exec_operand" " e"))]
+ ""
+ "v_<mnemonic>0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+;; Scalar binary bit operation with the same in-place DS alternative.
+(define_insn "<expander><mode>3_scalar"
+ [(set (match_operand:SCALAR_1REG_INT_MODE 0 "gcn_valu_dst_operand"
+ "= v,RD")
+ (vec_and_scalar_com:SCALAR_1REG_INT_MODE
+ (match_operand:SCALAR_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+ "% v, 0")
+ (match_operand:SCALAR_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+ "vSSB, v")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e, e"))]
+ "!memory_operand (operands[0], VOIDmode)
+ || (rtx_equal_p (operands[0], operands[1])
+ && register_operand (operands[2], VOIDmode))"
+ "@
+ v_<mnemonic>0\t%0, %2, %1
+ ds_<mnemonic>0\t%A0, %2%O0"
+ [(set_attr "type" "vop2,ds")
+ (set_attr "length" "8,8")])
+
+;; 64-bit vector bit operations: register form splits after reload into
+;; two independent 32-bit operations; the DS form stays whole.
+(define_insn_and_split "<expander>v64di3_vector"
+ [(set (match_operand:V64DI 0 "gcn_valu_dst_operand" "=&v,RD")
+ (vec_merge:V64DI
+ (bitop:V64DI
+ (match_operand:V64DI 1 "gcn_valu_src0_operand" "% v,RD")
+ (match_operand:V64DI 2 "gcn_valu_src1com_operand" "vSSB, v"))
+ (match_operand:V64DI 4 "gcn_register_ds_or_unspec_operand"
+ " U0,U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))]
+ "!memory_operand (operands[0], VOIDmode)
+ || (rtx_equal_p (operands[0], operands[1])
+ && register_operand (operands[2], VOIDmode))"
+ "@
+ #
+ ds_<mnemonic>0\t%A0, %2%O0"
+ "(reload_completed && !gcn_ds_memory_operand (operands[0], V64DImode))"
+ [(set (match_dup 5)
+ (vec_merge:V64SI
+ (bitop:V64SI (match_dup 7) (match_dup 9))
+ (match_dup 11)
+ (match_dup 3)))
+ (set (match_dup 6)
+ (vec_merge:V64SI
+ (bitop:V64SI (match_dup 8) (match_dup 10))
+ (match_dup 12)
+ (match_dup 3)))]
+ {
+ operands[5] = gcn_operand_part (V64DImode, operands[0], 0);
+ operands[6] = gcn_operand_part (V64DImode, operands[0], 1);
+ operands[7] = gcn_operand_part (V64DImode, operands[1], 0);
+ operands[8] = gcn_operand_part (V64DImode, operands[1], 1);
+ operands[9] = gcn_operand_part (V64DImode, operands[2], 0);
+ operands[10] = gcn_operand_part (V64DImode, operands[2], 1);
+ operands[11] = gcn_operand_part (V64DImode, operands[4], 0);
+ operands[12] = gcn_operand_part (V64DImode, operands[4], 1);
+ }
+ [(set_attr "type" "vmult,ds")
+ (set_attr "length" "16,8")])
+
+;; Scalar DI bit operations, split likewise into two SI halves.
+(define_insn_and_split "<expander>di3_scalar"
+ [(set (match_operand:DI 0 "gcn_valu_dst_operand" "= &v,RD")
+ (bitop:DI
+ (match_operand:DI 1 "gcn_valu_src0_operand" "% v,RD")
+ (match_operand:DI 2 "gcn_valu_src1com_operand" "vSSB, v")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e, e"))]
+ "!memory_operand (operands[0], VOIDmode)
+ || (rtx_equal_p (operands[0], operands[1])
+ && register_operand (operands[2], VOIDmode))"
+ "@
+ #
+ ds_<mnemonic>0\t%A0, %2%O0"
+ "(reload_completed && !gcn_ds_memory_operand (operands[0], DImode))"
+ [(parallel [(set (match_dup 4)
+ (bitop:V64SI (match_dup 6) (match_dup 8)))
+ (use (match_dup 3))])
+ (parallel [(set (match_dup 5)
+ (bitop:V64SI (match_dup 7) (match_dup 9)))
+ (use (match_dup 3))])]
+ {
+ operands[4] = gcn_operand_part (DImode, operands[0], 0);
+ operands[5] = gcn_operand_part (DImode, operands[0], 1);
+ operands[6] = gcn_operand_part (DImode, operands[1], 0);
+ operands[7] = gcn_operand_part (DImode, operands[1], 1);
+ operands[8] = gcn_operand_part (DImode, operands[2], 0);
+ operands[9] = gcn_operand_part (DImode, operands[2], 1);
+ }
+ [(set_attr "type" "vmult,ds")
+ (set_attr "length" "16,8")])
+
+;; Vector shift by scalar amount; <revmnemonic> with "%2, %1" ordering
+;; matches the hardware's shift-amount-first operand order.
+(define_insn "<expander>v64si3_vector"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (vec_merge:V64SI
+ (shiftop:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v")
+ (match_operand:SI 2 "gcn_alu_operand" "SSB"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+;; Vector shift with a per-lane amount.
+(define_insn "v<expander>v64si3_vector"
+ [(set (match_operand:V64SI 0 "register_operand" "=v")
+ (vec_merge:V64SI
+ (shiftop:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v")
+ (match_operand:V64SI 2 "gcn_alu_operand" "vB"))
+ (match_operand:V64SI 4 "gcn_register_or_unspec_operand" "U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+;; Full-exec shift (no vec_merge).
+;; NOTE(review): both alternatives emit an identical template; presumably
+;; they exist only to distinguish the Sg/I constraints — confirm whether
+;; a single alternative "SgI" would do.
+(define_insn "<expander>v64si3_full"
+ [(set (match_operand:V64SI 0 "register_operand" "=v,v")
+ (shiftop:V64SI (match_operand:V64SI 1 "register_operand" " v,v")
+ (match_operand:SI 2 "nonmemory_operand" "Sg,I")))]
+ ""
+ "@
+ v_<revmnemonic>0\t%0, %2, %1
+ v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "4")
+ (set_attr "exec" "full")])
+
+;; Anonymous scalar SI shift; exec mask as a (use).
+(define_insn "*<expander>si3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= v")
+ (shiftop:SI
+ (match_operand:SI 1 "gcn_alu_operand" " v")
+ (match_operand:SI 2 "gcn_alu_operand" "vSSB")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e"))]
+ ""
+ "v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+;; Vector min/max with the in-place DS alternative; V64QI excluded as in
+;; the corresponding expander.
+(define_insn "<expander><mode>3_vector"
+ [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v,RD")
+ (vec_merge:VEC_1REG_INT_MODE
+ (minmaxop:VEC_1REG_INT_MODE
+ (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+ "% v, 0")
+ (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+ "vSSB, v"))
+ (match_operand:VEC_1REG_INT_MODE 4
+ "gcn_register_ds_or_unspec_operand" " U0,U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))]
+ "<MODE>mode != V64QImode
+ && (!memory_operand (operands[0], VOIDmode)
+ || (rtx_equal_p (operands[0], operands[1])
+ && register_operand (operands[2], VOIDmode)))"
+ "@
+ v_<mnemonic>0\t%0, %2, %1
+ ds_<mnemonic>0\t%A0, %2%O0"
+ [(set_attr "type" "vop2,ds")
+ (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP binops - special cases
+
+; GCN does not directly provide a DFmode subtract instruction, so we do it by
+; adding the negated second operand to the first.
+
+;; V64DF subtract implemented as v_add_f64 with operand 2 negated via the
+;; source-modifier syntax (-%2); the two alternatives cover either operand
+;; being the non-register one.
(define_insn "subv64df3_vector"
+ [(set (match_operand:V64DF 0 "register_operand" "= v, v")
+ (vec_merge:V64DF
+ (minus:V64DF
+ (match_operand:V64DF 1 "gcn_alu_operand" "vSSB, v")
+ (match_operand:V64DF 2 "gcn_alu_operand" " v,vSSB"))
+ (match_operand:V64DF 4 "gcn_register_or_unspec_operand"
+ " U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))]
+ ""
+ "@
+ v_add_f64\t%0, %1, -%2
+ v_add_f64\t%0, -%2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8,8")])
+
+;; Scalar DF subtract, same add-with-negated-source trick; exec mask is
+;; carried as a (use).
+(define_insn "subdf_scalar"
+ [(set (match_operand:DF 0 "register_operand" "= v, v")
+ (minus:DF
+ (match_operand:DF 1 "gcn_alu_operand" "vSSB, v")
+ (match_operand:DF 2 "gcn_alu_operand" " v,vSSB")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e, e"))]
+ ""
+ "@
+ v_add_f64\t%0, %1, -%2
+ v_add_f64\t%0, -%2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP binops - generic
+
+;; FP mode iterators (vector and scalar, full and single-register sets)
+;; and the commutative/non-commutative operation classes used below.
(define_mode_iterator VEC_FP_MODE [V64HF V64SF V64DF])
+(define_mode_iterator VEC_FP_1REG_MODE [V64HF V64SF])
+(define_mode_iterator FP_MODE [HF SF DF])
+(define_mode_iterator FP_1REG_MODE [HF SF])
+
+(define_code_iterator comm_fp [plus mult smin smax])
+(define_code_iterator nocomm_fp [minus])
+(define_code_iterator all_fp [plus mult minus smin smax])
+
+;; Commutative FP binops on vectors, under the exec mask.
+(define_insn "<expander><mode>3_vector"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v")
+ (vec_merge:VEC_FP_MODE
+ (comm_fp:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "% v")
+ (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" "vSSB"))
+ (match_operand:VEC_FP_MODE 4 "gcn_register_or_unspec_operand"
+ " U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_<mnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+;; Scalar commutative FP binops; alternative 2 writes memory ("RL").
+;; NOTE(review): the second alternative emits a v_ mnemonic but is typed
+;; "ds" and drops operand 2 from the template — verify this alternative
+;; is correct (a ds_ mnemonic would be expected for a memory destination).
+(define_insn "<expander><mode>3_scalar"
+ [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand" "= v, RL")
+ (comm_fp:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_valu_src0_operand" "% v, 0")
+ (match_operand:FP_MODE 2 "gcn_valu_src1_operand" "vSSB,vSSB")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e, e"))]
+ ""
+ "@
+ v_<mnemonic>0\t%0, %2, %1
+ v_<mnemonic>0\t%0, %1%O0"
+ [(set_attr "type" "vop2,ds")
+ (set_attr "length" "8")])
+
+;; Non-commutative FP binops (minus): the second alternative swaps the
+;; operands and uses the reverse mnemonic (e.g. subrev).
+(define_insn "<expander><mode>3_vector"
+ [(set (match_operand:VEC_FP_1REG_MODE 0 "register_operand" "= v, v")
+ (vec_merge:VEC_FP_1REG_MODE
+ (nocomm_fp:VEC_FP_1REG_MODE
+ (match_operand:VEC_FP_1REG_MODE 1 "gcn_alu_operand" "vSSB, v")
+ (match_operand:VEC_FP_1REG_MODE 2 "gcn_alu_operand" " v,vSSB"))
+ (match_operand:VEC_FP_1REG_MODE 4 "gcn_register_or_unspec_operand"
+ " U0, U0")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e, e")))]
+ ""
+ "@
+ v_<mnemonic>0\t%0, %1, %2
+ v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8,8")])
+
+(define_insn "<expander><mode>3_scalar"
+ [(set (match_operand:FP_1REG_MODE 0 "register_operand" "= v, v")
+ (nocomm_fp:FP_1REG_MODE
+ (match_operand:FP_1REG_MODE 1 "gcn_alu_operand" "vSSB, v")
+ (match_operand:FP_1REG_MODE 2 "gcn_alu_operand" " v,vSSB")))
+ (use (match_operand:DI 3 "gcn_exec_operand" " e, e"))]
+ ""
+ "@
+ v_<mnemonic>0\t%0, %1, %2
+ v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8,8")])
+
+;; Generic vector FP expander: full exec mask, undefined merge source.
+(define_expand "<expander><mode>3"
+ [(set (match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+ (vec_merge:VEC_FP_MODE
+ (all_fp:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_valu_src0_operand")
+ (match_operand:VEC_FP_MODE 2 "gcn_valu_src1_operand"))
+ (match_dup 4)
+ (match_dup 3)))]
+ ""
+ {
+ operands[3] = gcn_full_exec_reg ();
+ operands[4] = gcn_gen_undef (<MODE>mode);
+ })
+
+;; Generic scalar FP expander: attach the scalar exec value as a (use).
+(define_expand "<expander><mode>3"
+ [(parallel [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+ (all_fp:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_valu_src0_operand")
+ (match_operand:FP_MODE 2 "gcn_valu_src1_operand")))
+ (use (match_dup 3))])]
+ ""
+ {
+ operands[3] = gcn_scalar_exec ();
+ })
+
+;; }}}
+;; {{{ FP unops
+
+;; Scalar FP abs: add 0 with the |src| source modifier.
(define_insn "abs<mode>2"
+ [(set (match_operand:FP_MODE 0 "register_operand" "=v")
+ (abs:FP_MODE (match_operand:FP_MODE 1 "register_operand" " v")))]
+ ""
+ "v_add%i0\t%0, 0, |%1|"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; Vector abs expander: wrap the insn with full exec and undef merge.
+(define_expand "abs<mode>2"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand")
+ (abs:VEC_FP_MODE (match_operand:VEC_FP_MODE 1 "register_operand")))]
+ ""
+ {
+ emit_insn (gen_abs<mode>2_vector (operands[0], operands[1],
+ gcn_full_exec_reg (),
+ gcn_gen_undef (<MODE>mode)));
+ DONE;
+ })
+
+(define_insn "abs<mode>2_vector"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "=v")
+ (vec_merge:VEC_FP_MODE
+ (abs:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "register_operand" " v"))
+ (match_operand:VEC_FP_MODE 3 "gcn_register_or_unspec_operand"
+ "U0")
+ (match_operand:DI 2 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_add%i0\t%0, 0, |%1|"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; Vector FP negate: add 0 with the negate source modifier (-src).
+(define_expand "neg<mode>2"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand")
+ (neg:VEC_FP_MODE (match_operand:VEC_FP_MODE 1 "register_operand")))]
+ ""
+ {
+ emit_insn (gen_neg<mode>2_vector (operands[0], operands[1],
+ gcn_full_exec_reg (),
+ gcn_gen_undef (<MODE>mode)));
+ DONE;
+ })
+
+(define_insn "neg<mode>2_vector"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "=v")
+ (vec_merge:VEC_FP_MODE
+ (neg:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "register_operand" " v"))
+ (match_operand:VEC_FP_MODE 3 "gcn_register_or_unspec_operand"
+ "U0")
+ (match_operand:DI 2 "gcn_exec_reg_operand" " e")))]
+ ""
+ "v_add%i0\t%0, 0, -%1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; Square root, gated on -funsafe-math-optimizations (hardware v_sqrt is
+;; presumably not correctly rounded for all modes — see the condition).
+(define_insn "sqrt<mode>_vector"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v")
+ (vec_merge:VEC_FP_MODE
+ (sqrt:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "vSSB"))
+ (match_operand:VEC_FP_MODE 3 "gcn_register_or_unspec_operand"
+ " U0")
+ (match_operand:DI 2 "gcn_exec_reg_operand" " e")))]
+ "flag_unsafe_math_optimizations"
+ "v_sqrt%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "sqrt<mode>_scalar"
+ [(set (match_operand:FP_MODE 0 "register_operand" "= v")
+ (sqrt:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_alu_operand" "vSSB")))
+ (use (match_operand:DI 2 "gcn_exec_operand" " e"))]
+ "flag_unsafe_math_optimizations"
+ "v_sqrt%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand")
+ (vec_merge:VEC_FP_MODE
+ (sqrt:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand"))
+ (match_dup 3)
+ (match_dup 2)))]
+ "flag_unsafe_math_optimizations"
+ {
+ operands[2] = gcn_full_exec_reg ();
+ operands[3] = gcn_gen_undef (<MODE>mode);
+ })
+
+(define_expand "sqrt<mode>2"
+ [(parallel [(set (match_operand:FP_MODE 0 "register_operand")
+ (sqrt:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_alu_operand")))
+ (use (match_dup 2))])]
+ "flag_unsafe_math_optimizations"
+ {
+ operands[2] = gcn_scalar_exec ();
+ })
+
+;; }}}
+;; {{{ FP fused multiply and add
+
+;; Fused multiply-add on vectors: dst = 1 * 2 + 3, under the exec mask.
+;; The alternatives allow the non-register source in either position 2
+;; or 3; operand 1 is commutative with 2 ("%").
(define_insn "fma<mode>_vector"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v, v")
+ (vec_merge:VEC_FP_MODE
+ (fma:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" " vA,vSSA")
+ (match_operand:VEC_FP_MODE 3 "gcn_alu_operand" "vSSA, vA"))
+ (match_operand:VEC_FP_MODE 5 "gcn_register_or_unspec_operand"
+ " U0, U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e, e")))]
+ ""
+ "v_fma%i0\t%0, %1, %2, %3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; FMA with the second source negated (dst = 1 * -2 + 3), using the
+;; negate source modifier rather than a separate instruction.
+(define_insn "fma<mode>_vector_negop2"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v, v, v")
+ (vec_merge:VEC_FP_MODE
+ (fma:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" " vA, vA,vSSA")
+ (neg:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 2 "gcn_alu_operand"
+ " vA,vSSA, vA"))
+ (match_operand:VEC_FP_MODE 3 "gcn_alu_operand" "vSSA, vA, vA"))
+ (match_operand:VEC_FP_MODE 5 "gcn_register_or_unspec_operand"
+ " U0, U0, U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e, e, e")))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, %3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; Scalar FMA; exec mask carried as a (use).
+(define_insn "fma<mode>_scalar"
+ [(set (match_operand:FP_MODE 0 "register_operand" "= v, v")
+ (fma:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:FP_MODE 2 "gcn_alu_operand" " vA,vSSA")
+ (match_operand:FP_MODE 3 "gcn_alu_operand" "vSSA, vA")))
+ (use (match_operand:DI 4 "gcn_exec_operand" " e, e"))]
+ ""
+ "v_fma%i0\t%0, %1, %2, %3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fma<mode>_scalar_negop2"
+ [(set (match_operand:FP_MODE 0 "register_operand" "= v, v, v")
+ (fma:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_alu_operand" " vA, vA,vSSA")
+ (neg:FP_MODE
+ (match_operand:FP_MODE 2 "gcn_alu_operand" " vA,vSSA, vA"))
+ (match_operand:FP_MODE 3 "gcn_alu_operand" "vSSA, vA, vA")))
+ (use (match_operand:DI 4 "gcn_exec_operand" " e, e, e"))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, %3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; Standard-named fma expanders: supply full exec and an undef merge.
+(define_expand "fma<mode>4"
+ [(set (match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+ (vec_merge:VEC_FP_MODE
+ (fma:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_valu_src1_operand")
+ (match_operand:VEC_FP_MODE 2 "gcn_valu_src1_operand")
+ (match_operand:VEC_FP_MODE 3 "gcn_valu_src1_operand"))
+ (match_dup 5)
+ (match_dup 4)))]
+ ""
+ {
+ operands[4] = gcn_full_exec_reg ();
+ operands[5] = gcn_gen_undef (<MODE>mode);
+ })
+
+(define_expand "fma<mode>4_negop2"
+ [(set (match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+ (vec_merge:VEC_FP_MODE
+ (fma:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_valu_src1_operand")
+ (neg:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 2 "gcn_valu_src1_operand"))
+ (match_operand:VEC_FP_MODE 3 "gcn_valu_src1_operand"))
+ (match_dup 5)
+ (match_dup 4)))]
+ ""
+ {
+ operands[4] = gcn_full_exec_reg ();
+ operands[5] = gcn_gen_undef (<MODE>mode);
+ })
+
+(define_expand "fma<mode>4"
+ [(parallel [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+ (fma:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_valu_src1_operand")
+ (match_operand:FP_MODE 2 "gcn_valu_src1_operand")
+ (match_operand:FP_MODE 3 "gcn_valu_src1_operand")))
+ (use (match_dup 4))])]
+ ""
+ {
+ operands[4] = gcn_scalar_exec ();
+ })
+
+(define_expand "fma<mode>4_negop2"
+ [(parallel [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+ (fma:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_valu_src1_operand")
+ (neg:FP_MODE
+ (match_operand:FP_MODE 2 "gcn_valu_src1_operand"))
+ (match_operand:FP_MODE 3 "gcn_valu_src1_operand")))
+ (use (match_dup 4))])]
+ ""
+ {
+ operands[4] = gcn_scalar_exec ();
+ })
+
+;; }}}
+;; {{{ FP division
+
+;; Reciprocal approximation, vector form.  Operand 1 is restricted to the
+;; constant 1.0 (gcn_vec1d_operand), so the div rtx really denotes 1/x;
+;; only operand 2 is emitted in the assembler template.
+(define_insn "recip<mode>_vector"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"	  "=  v")
+	(vec_merge:VEC_FP_MODE
+	  (div:VEC_FP_MODE
+	    (match_operand:VEC_FP_MODE 1 "gcn_vec1d_operand" "   A")
+	    (match_operand:VEC_FP_MODE 2 "gcn_alu_operand"   "vSSB"))
+	  (match_operand:VEC_FP_MODE 4 "gcn_register_or_unspec_operand"
+							     "  U0")
+	  (match_operand:DI 3 "gcn_exec_reg_operand"	     "   e")))]
+  ""
+  "v_rcp%i0\t%0, %2"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Reciprocal approximation, scalar form (operand 1 must be 1.0).
+(define_insn "recip<mode>_scalar"
+  [(set (match_operand:FP_MODE 0 "register_operand"	"=  v")
+	(div:FP_MODE
+	  (match_operand:FP_MODE 1 "gcn_const1d_operand" "   A")
+	  (match_operand:FP_MODE 2 "gcn_alu_operand"	 "vSSB")))
+   (use (match_operand:DI 3 "gcn_exec_operand"		 "   e"))]
+  ""
+  "v_rcp%i0\t%0, %2"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Do division via a = b * 1/c
+;; The v_rcp_* instructions are not sufficiently accurate on their own,
+;; so we use 2 v_fma_* instructions to do one round of Newton-Raphson
+;; which the ISA manual says is enough to improve the reciprocal accuracy.
+;;
+;; FIXME: This does not handle denormals, NaNs, division-by-zero etc.
+
+;; Vector division, enabled only under -freciprocal-math.
+;; Sequence: initrcp = rcp(c); fma = initrcp*-c + 2; rcp = initrcp*fma
+;; (one Newton-Raphson step); result = b * rcp.
+(define_expand "div<mode>3"
+  [(match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+   (match_operand:VEC_FP_MODE 1 "gcn_valu_src0_operand")
+   (match_operand:VEC_FP_MODE 2 "gcn_valu_src0_operand")]
+  "flag_reciprocal_math"
+  {
+    rtx one = gcn_vec_constant (<MODE>mode,
+		  const_double_from_real_value (dconst1, <SCALAR_MODE>mode));
+    rtx two = gcn_vec_constant (<MODE>mode,
+		  const_double_from_real_value (dconst2, <SCALAR_MODE>mode));
+    rtx initrcp = gen_reg_rtx (<MODE>mode);
+    rtx fma = gen_reg_rtx (<MODE>mode);
+    rtx rcp;
+
+    /* NOTE(review): this shortcut tests the numerator against -1.0
+       (dconstm1) but then stores the un-negated reciprocal into
+       operands[0]; it looks like dconst1 was intended — confirm.  */
+    bool is_rcp = (GET_CODE (operands[1]) == CONST_VECTOR
+		   && real_identical
+		        (CONST_DOUBLE_REAL_VALUE
+			  (CONST_VECTOR_ELT (operands[1], 0)), &dconstm1));
+
+    if (is_rcp)
+      rcp = operands[0];
+    else
+      rcp = gen_reg_rtx (<MODE>mode);
+
+    emit_insn (gen_recip<mode>_vector (initrcp, one, operands[2],
+				       gcn_full_exec_reg (),
+				       gcn_gen_undef (<MODE>mode)));
+    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, operands[2], two));
+    emit_insn (gen_mul<mode>3 (rcp, initrcp, fma));
+
+    if (!is_rcp)
+      emit_insn (gen_mul<mode>3 (operands[0], operands[1], rcp));
+
+    DONE;
+  })
+
+;; Scalar division; same Newton-Raphson sequence as the vector expander.
+(define_expand "div<mode>3"
+  [(match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+   (match_operand:FP_MODE 1 "gcn_valu_src0_operand")
+   (match_operand:FP_MODE 2 "gcn_valu_src0_operand")]
+  "flag_reciprocal_math"
+  {
+    rtx one = const_double_from_real_value (dconst1, <MODE>mode);
+    rtx two = const_double_from_real_value (dconst2, <MODE>mode);
+    rtx initrcp = gen_reg_rtx (<MODE>mode);
+    rtx fma = gen_reg_rtx (<MODE>mode);
+    rtx rcp;
+
+    /* NOTE(review): same dconstm1-vs-dconst1 question as the vector
+       div<mode>3 above — confirm the intended constant.  */
+    bool is_rcp = (GET_CODE (operands[1]) == CONST_DOUBLE
+		   && real_identical (CONST_DOUBLE_REAL_VALUE (operands[1]),
+				      &dconstm1));
+
+    if (is_rcp)
+      rcp = operands[0];
+    else
+      rcp = gen_reg_rtx (<MODE>mode);
+
+    emit_insn (gen_recip<mode>_scalar (initrcp, one, operands[2],
+				       gcn_scalar_exec ()));
+    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, operands[2], two));
+    emit_insn (gen_mul<mode>3 (rcp, initrcp, fma));
+
+    if (!is_rcp)
+      emit_insn (gen_mul<mode>3 (operands[0], operands[1], rcp));
+
+    DONE;
+  })
+;; {{{ Int/FP conversions
+
+;; Scalar source/destination mode sets for conversions.  FROM and TO
+;; deliberately overlap; invalid (including identity) combinations are
+;; rejected by the gcn_valid_cvt_p condition on each pattern.
+(define_mode_iterator CVT_FROM_MODE [HI SI HF SF DF])
+(define_mode_iterator CVT_TO_MODE [HI SI HF SF DF])
+(define_mode_iterator CVT_F_MODE [HF SF DF])
+(define_mode_iterator CVT_I_MODE [HI SI])
+
+;; Vector (64-lane) equivalents of the above.
+(define_mode_iterator VCVT_FROM_MODE [V64HI V64SI V64HF V64SF V64DF])
+(define_mode_iterator VCVT_TO_MODE [V64HI V64SI V64HF V64SF V64DF])
+(define_mode_iterator VCVT_F_MODE [V64HF V64SF V64DF])
+(define_mode_iterator VCVT_I_MODE [V64HI V64SI])
+
+;; Conversion rtx codes with their standard pattern-name prefix and the
+;; v_cvt operand-suffix letters (%i = signed int, %u = unsigned int).
+(define_code_iterator cvt_op [fix unsigned_fix
+			      float unsigned_float
+			      float_extend float_truncate])
+(define_code_attr cvt_name [(fix "fix_trunc") (unsigned_fix "fixuns_trunc")
+			    (float "float") (unsigned_float "floatuns")
+			    (float_extend "extend") (float_truncate "trunc")])
+(define_code_attr cvt_operands [(fix "%i0%i1") (unsigned_fix "%u0%i1")
+				(float "%i0%i1") (unsigned_float "%i0%u1")
+				(float_extend "%i0%i1")
+				(float_truncate "%i0%i1")])
+
+;; Scalar int/float -> float conversions (single-lane EXEC).
+(define_expand "<cvt_name><CVT_FROM_MODE:mode><CVT_F_MODE:mode>2"
+  [(parallel [(set (match_operand:CVT_F_MODE 0 "register_operand")
+		   (cvt_op:CVT_F_MODE
+		     (match_operand:CVT_FROM_MODE 1 "gcn_valu_src0_operand")))
+	      (use (match_dup 2))])]
+  "gcn_valid_cvt_p (<CVT_FROM_MODE:MODE>mode, <CVT_F_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  {
+    operands[2] = gcn_scalar_exec ();
+  })
+
+;; Vector int/float -> float conversions (full EXEC, undefined merge).
+(define_expand "<cvt_name><VCVT_FROM_MODE:mode><VCVT_F_MODE:mode>2"
+  [(set (match_operand:VCVT_F_MODE 0 "register_operand")
+	(vec_merge:VCVT_F_MODE
+	  (cvt_op:VCVT_F_MODE
+	    (match_operand:VCVT_FROM_MODE 1 "gcn_valu_src0_operand"))
+	  (match_dup 3)
+	  (match_dup 2)))]
+  "gcn_valid_cvt_p (<VCVT_FROM_MODE:MODE>mode, <VCVT_F_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  {
+    operands[2] = gcn_full_exec_reg ();
+    operands[3] = gcn_gen_undef (<VCVT_F_MODE:MODE>mode);
+  })
+
+;; Scalar float -> int conversions.
+(define_expand "<cvt_name><CVT_F_MODE:mode><CVT_I_MODE:mode>2"
+  [(parallel [(set (match_operand:CVT_I_MODE 0 "register_operand")
+		   (cvt_op:CVT_I_MODE
+		     (match_operand:CVT_F_MODE 1 "gcn_valu_src0_operand")))
+	      (use (match_dup 2))])]
+  "gcn_valid_cvt_p (<CVT_F_MODE:MODE>mode, <CVT_I_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  {
+    operands[2] = gcn_scalar_exec ();
+  })
+
+;; Vector float -> int conversions.
+(define_expand "<cvt_name><VCVT_F_MODE:mode><VCVT_I_MODE:mode>2"
+  [(set (match_operand:VCVT_I_MODE 0 "register_operand")
+	(vec_merge:VCVT_I_MODE
+	  (cvt_op:VCVT_I_MODE
+	    (match_operand:VCVT_F_MODE 1 "gcn_valu_src0_operand"))
+	  (match_dup 3)
+	  (match_dup 2)))]
+  "gcn_valid_cvt_p (<VCVT_F_MODE:MODE>mode, <VCVT_I_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  {
+    operands[2] = gcn_full_exec_reg ();
+    operands[3] = gcn_gen_undef (<VCVT_I_MODE:MODE>mode);
+  })
+
+;; Scalar conversion instruction; one v_cvt with operand-size suffixes
+;; chosen by <cvt_operands>.
+(define_insn "<cvt_name><CVT_FROM_MODE:mode><CVT_TO_MODE:mode>2_insn"
+  [(set (match_operand:CVT_TO_MODE 0 "register_operand"	 "=  v")
+	(cvt_op:CVT_TO_MODE
+	  (match_operand:CVT_FROM_MODE 1 "gcn_alu_operand" "vSSB")))
+   (use (match_operand:DI 2 "gcn_exec_operand"		 "   e"))]
+  "gcn_valid_cvt_p (<CVT_FROM_MODE:MODE>mode, <CVT_TO_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  "v_cvt<cvt_operands>\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Vector conversion instruction under an EXEC mask.
+(define_insn "<cvt_name><VCVT_FROM_MODE:mode><VCVT_TO_MODE:mode>2_insn"
+  [(set (match_operand:VCVT_TO_MODE 0 "register_operand"    "=  v")
+	(vec_merge:VCVT_TO_MODE
+	  (cvt_op:VCVT_TO_MODE
+	    (match_operand:VCVT_FROM_MODE 1 "gcn_alu_operand" "vSSB"))
+	  (match_operand:VCVT_TO_MODE 2 "gcn_alu_or_unspec_operand" " U0")
+	  (match_operand:DI 3 "gcn_exec_operand"	    "   e")))]
+  "gcn_valid_cvt_p (<VCVT_FROM_MODE:MODE>mode, <VCVT_TO_MODE:MODE>mode,
+		    <cvt_name>_cvt)"
+  "v_cvt<cvt_operands>\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+;; {{{ Int/int conversions
+
+;; GCC can already do these for scalar types, but not for vector types.
+;; Unfortunately you can't just do SUBREG on a vector to select the low part,
+;; so there must be a few tricks here.
+
+;; Truncate V64DI to V64SI by splitting (after reload) into a plain V64SI
+;; move of the low 32-bit part of each lane.  Alternative 0 is a no-op
+;; (input and output share a register, length 0); alternative 1 needs an
+;; early-clobbered destination and a real move.
+(define_insn_and_split "vec_truncatev64div64si"
+  [(set (match_operand:V64SI 0 "register_operand"	 "=v,&v")
+	(vec_merge:V64SI
+	  (truncate:V64SI
+	    (match_operand:V64DI 1 "register_operand"	 " 0, v"))
+	  (match_operand:V64SI 2 "gcn_alu_or_unspec_operand" "U0,U0")
+	  (match_operand:DI 3 "gcn_exec_operand"	 " e, e")))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (vec_merge:V64SI (match_dup 1) (match_dup 2) (match_dup 3)))
+	      (clobber (scratch:V64DI))])]
+  {
+    /* Replace the V64DI source with its low-part V64SI register.  */
+    operands[1] = gcn_operand_part (V64SImode, operands[1], 0);
+  }
+  [(set_attr "type" "vop2")
+   (set_attr "length" "0,4")])
+;; {{{ Vector comparison/merge
+
+;; Vector compare producing a DImode lane-mask (one bit per lane), ANDed
+;; with the full EXEC mask so inactive lanes read as false.
+(define_expand "vec_cmp<mode>di"
+  [(parallel
+    [(set (match_operand:DI 0 "register_operand")
+	  (and:DI
+	    (match_operator 1 "comparison_operator"
+	      [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand")
+	       (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand")])
+	    (match_dup 4)))
+     (clobber (match_scratch:DI 5))])]
+  ""
+  {
+    operands[4] = gcn_full_exec_reg ();
+  })
+
+;; Unsigned variant of vec_cmp<mode>di (integer vector modes only).
+(define_expand "vec_cmpu<mode>di"
+  [(parallel
+    [(set (match_operand:DI 0 "register_operand")
+	  (and:DI
+	    (match_operator 1 "comparison_operator"
+	      [(match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+	       (match_operand:VEC_1REG_INT_MODE 3 "gcn_vop3_operand")])
+	    (match_dup 4)))
+     (clobber (match_scratch:DI 5))])]
+  ""
+  {
+    operands[4] = gcn_full_exec_reg ();
+  })
+
+;; Compare instruction.  Alternatives target VCC (v_cmp), EXEC via the
+;; exec-writing form (v_cmpx, which clobbers VCC as scratch), or an
+;; arbitrary SGPR pair via the VOP3a encoding.
+(define_insn "vec_cmp<mode>di_insn"
+  [(set (match_operand:DI 0 "register_operand"	      "=cV,cV, e, e,Sg,Sg")
+	(and:DI
+	  (match_operator 1 "comparison_operator"
+	    [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand"
+						      "vSS, B,vSS, B, v,vA")
+	     (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+						      "  v, v,  v, v,vA, v")])
+	  (match_operand:DI 4 "gcn_exec_reg_operand"  "  e, e,  e, e, e, e")))
+   (clobber (match_scratch:DI 5			      "= X, X, cV,cV, X, X"))]
+  ""
+  "@
+   v_cmp%E1\tvcc, %2, %3
+   v_cmp%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmp%E1\t%0, %2, %3
+   v_cmp%E1\t%0, %2, %3"
+  [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a")
+   (set_attr "length" "4,8,4,8,8,8")])
+
+;; As vec_cmp<mode>di_insn, but operand 2 is a scalar broadcast
+;; (vec_duplicate) rather than a vector register.
+(define_insn "vec_cmp<mode>di_dup"
+  [(set (match_operand:DI 0 "register_operand"		 "=cV,cV, e,e,Sg")
+	(and:DI
+	  (match_operator 1 "comparison_operator"
+	    [(vec_duplicate:VEC_1REG_MODE
+	       (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand"
+							 " SS, B,SS,B, A"))
+	     (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+							 "  v, v, v,v, v")])
+	  (match_operand:DI 4 "gcn_exec_reg_operand"	 "  e, e, e,e, e")))
+   (clobber (match_scratch:DI 5			 "= X,X,cV,cV, X"))]
+  ""
+  "@
+   v_cmp%E1\tvcc, %2, %3
+   v_cmp%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmp%E1\t%0, %2, %3"
+  [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a")
+   (set_attr "length" "4,8,4,8,8")])
+
+;; Select between two vectors under a DImode lane-mask (operand 3).
+(define_expand "vcond_mask_<mode>di"
+  [(parallel
+    [(set (match_operand:VEC_REG_MODE 0 "register_operand" "")
+	  (vec_merge:VEC_REG_MODE
+	    (match_operand:VEC_REG_MODE 1 "gcn_vop3_operand" "")
+	    (match_operand:VEC_REG_MODE 2 "gcn_alu_operand" "")
+	    (match_operand:DI 3 "register_operand" "")))
+     (clobber (scratch:V64DI))])]
+  ""
+  "")
+
+;; vcond: compare operands 4/5 into a temporary lane-mask, then merge
+;; operands 1/2 under that mask via vcond_mask.
+(define_expand "vcond<VEC_1REG_MODE:mode><VEC_1REG_ALT:mode>"
+  [(match_operand:VEC_1REG_MODE 0 "register_operand")
+   (match_operand:VEC_1REG_MODE 1 "gcn_vop3_operand")
+   (match_operand:VEC_1REG_MODE 2 "gcn_alu_operand")
+   (match_operator 3 "comparison_operator"
+     [(match_operand:VEC_1REG_ALT 4 "gcn_alu_operand")
+      (match_operand:VEC_1REG_ALT 5 "gcn_vop3_operand")])]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (DImode);
+    rtx cmp_op = gen_rtx_fmt_ee (GET_CODE (operands[3]), DImode, operands[4],
+				 operands[5]);
+    rtx set = gen_rtx_SET (tmp, gen_rtx_AND (DImode, cmp_op,
+					     gcn_full_exec_reg ()));
+    rtx clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (DImode));
+    emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
+    emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+					tmp));
+    DONE;
+  })
+
+
+;; Unsigned vcond; identical expansion to the signed form above.
+(define_expand "vcondu<VEC_1REG_INT_MODE:mode><VEC_1REG_INT_ALT:mode>"
+  [(match_operand:VEC_1REG_INT_MODE 0 "register_operand")
+   (match_operand:VEC_1REG_INT_MODE 1 "gcn_vop3_operand")
+   (match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+   (match_operator 3 "comparison_operator"
+     [(match_operand:VEC_1REG_INT_ALT 4 "gcn_alu_operand")
+      (match_operand:VEC_1REG_INT_ALT 5 "gcn_vop3_operand")])]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (DImode);
+    rtx cmp_op = gen_rtx_fmt_ee (GET_CODE (operands[3]), DImode, operands[4],
+				 operands[5]);
+    rtx set = gen_rtx_SET (tmp,
+			   gen_rtx_AND (DImode, cmp_op, gcn_full_exec_reg ()));
+    rtx clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (DImode));
+    emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
+    emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+					tmp));
+    DONE;
+  })
+;; {{{ Fully masked loop support
+;;
+;; The autovectorizer requires the mask is a vector value (we use V64BImode),
+;; but the backend uses simple DImode for the same thing.
+;;
+;; There are two kinds of patterns here:
+;;
+;; 1) Expanders for masked vector operations (while_ult, maskload, etc.)
+;;
+;; 2) Expanders that convert general V64BImode operations to DImode
+;;    equivalents.
+;;
+;; Compute the mask of lanes whose index i satisfies op1+i < op2.
+;; Constant bounds fold to an immediate bitmask; otherwise compare the
+;; per-lane index vector (VGPR 1, see below) against the bound.
+(define_expand "while_ultsiv64bi"
+  [(match_operand:V64BI 0 "register_operand")
+   (match_operand:SI 1 "")
+   (match_operand:SI 2 "")]
+  ""
+  {
+    operands[0] = gcn_convert_mask_mode (operands[0]);
+
+    if (GET_CODE (operands[1]) != CONST_INT
+	|| GET_CODE (operands[2]) != CONST_INT)
+      {
+	rtx exec = gcn_full_exec_reg ();
+	/* VGPR 1 is assumed to hold the lane-index sequence 0..63 —
+	   TODO confirm against the kernel prologue setup.  */
+	rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+	rtx tmp = _0_1_2_3;
+	if (GET_CODE (operands[1]) != CONST_INT
+	    || INTVAL (operands[1]) != 0)
+	  {
+	    tmp = gen_reg_rtx (V64SImode);
+	    emit_insn (gen_addv64si3_vector_dup (tmp, _0_1_2_3, operands[1],
+						 exec, tmp));
+	  }
+	emit_insn (gen_vec_cmpv64sidi_dup (operands[0],
+					   gen_rtx_GT (VOIDmode, 0, 0),
+					   operands[2], tmp, exec));
+      }
+    else
+      {
+	/* NOTE(review): a negative diff (op2 < op1) is not handled
+	   specially; presumably the vectorizer never generates that.  */
+	HOST_WIDE_INT diff = INTVAL (operands[2]) - INTVAL (operands[1]);
+	HOST_WIDE_INT mask = (diff >= 64 ? -1 : ~((HOST_WIDE_INT)-1 << diff));
+	emit_move_insn (operands[0], gen_rtx_CONST_INT (VOIDmode, mask));
+      }
+    DONE;
+  })
+
+;; Compare two V64BI masks by delegating to the DImode cstore.
+(define_expand "cstorev64bi4"
+  [(match_operand:BI 0 "gcn_conditional_register_operand")
+   (match_operator:BI 1 "gcn_compare_operator"
+     [(match_operand:V64BI 2 "gcn_alu_operand")
+      (match_operand:V64BI 3 "gcn_alu_operand")])]
+  ""
+  {
+    operands[2] = gcn_convert_mask_mode (operands[2]);
+    operands[3] = gcn_convert_mask_mode (operands[3]);
+
+    emit_insn (gen_cstoredi4 (operands[0], operands[1], operands[2],
+			      operands[3]));
+    DONE;
+  })
+
+;; Conditional branch on a V64BI mask comparison, via cbranchdi4.
+(define_expand "cbranchv64bi4"
+  [(match_operator 0 "gcn_compare_operator"
+     [(match_operand:SI 1 "")
+      (match_operand:SI 2 "")])
+   (match_operand 3)]
+  ""
+  {
+    operands[1] = gcn_convert_mask_mode (operands[1]);
+    operands[2] = gcn_convert_mask_mode (operands[2]);
+
+    emit_insn(gen_cbranchdi4 (operands[0], operands[1], operands[2],
+			      operands[3]));
+    DONE;
+  })
+
+;; Move a V64BI mask; both sides are converted to DImode in place.
+(define_expand "movv64bi"
+  [(set (match_operand:V64BI 0 "nonimmediate_operand")
+	(match_operand:V64BI 1 "general_operand"))]
+  ""
+  {
+    operands[0] = gcn_convert_mask_mode (operands[0]);
+    operands[1] = gcn_convert_mask_mode (operands[1]);
+  })
+
+;; vcond_mask with a V64BI mask: convert and delegate to the DI form.
+(define_expand "vcond_mask_<mode>v64bi"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:VEC_REG_MODE 1 "register_operand")
+   (match_operand:VEC_REG_MODE 2 "register_operand")
+   (match_operand:V64BI 3 "register_operand")]
+  ""
+  {
+    operands[3] = gcn_convert_mask_mode (operands[3]);
+
+    emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+					operands[3]));
+    DONE;
+  })
+
+;; Masked load: convert the scalar address into a per-lane vector address,
+;; then gather with the mask as EXEC; inactive lanes are left undefined.
+(define_expand "maskload<mode>v64bi"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:VEC_REG_MODE 1 "memory_operand")
+   (match_operand 2 "")]
+  ""
+  {
+    rtx exec = force_reg (DImode, gcn_convert_mask_mode (operands[2]));
+    rtx addr = gcn_expand_scalar_to_vector_address
+      (<MODE>mode, exec, operands[1], gen_rtx_SCRATCH (V64DImode));
+    rtx as = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+    rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+    rtx undef = gcn_gen_undef (<MODE>mode);
+    emit_insn (gen_gather<mode>_expr (operands[0], addr, as, v, undef, exec));
+    DONE;
+  })
+
+;; Masked store: as maskload, but scattering operand 1 to memory.
+(define_expand "maskstore<mode>v64bi"
+  [(match_operand:VEC_REG_MODE 0 "memory_operand")
+   (match_operand:VEC_REG_MODE 1 "register_operand")
+   (match_operand 2 "")]
+  ""
+  {
+    rtx exec = force_reg (DImode, gcn_convert_mask_mode (operands[2]));
+    rtx addr = gcn_expand_scalar_to_vector_address
+      (<MODE>mode, exec, operands[0], gen_rtx_SCRATCH (V64DImode));
+    rtx as = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+    rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+    emit_insn (gen_scatter<mode>_expr (addr, operands[1], as, v, exec));
+    DONE;
+  })
+
+;; Masked gather: base (op1) + offset vector (op2) * scale (op3/op4),
+;; truncating a V64DI offset vector to V64SI when necessary.
+(define_expand "mask_gather_load<mode>"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand 2 "register_operand")
+   (match_operand 3 "immediate_operand")
+   (match_operand:SI 4 "gcn_alu_operand")
+   (match_operand:V64BI 5 "")]
+  ""
+  {
+    rtx exec = force_reg (DImode, gcn_convert_mask_mode (operands[5]));
+
+    /* TODO: more conversions will be needed when more types are vectorized. */
+    if (GET_MODE (operands[2]) == V64DImode)
+      {
+	rtx tmp = gen_reg_rtx (V64SImode);
+	emit_insn (gen_vec_truncatev64div64si (tmp, operands[2],
+					       gcn_gen_undef (V64SImode),
+					       exec));
+	operands[2] = tmp;
+      }
+
+    emit_insn (gen_gather<mode>_exec (operands[0], operands[1], operands[2],
+				      operands[3], operands[4], exec));
+    DONE;
+  })
+
+;; Masked scatter; mirror image of mask_gather_load.
+(define_expand "mask_scatter_store<mode>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand 1 "register_operand")
+   (match_operand 2 "immediate_operand")
+   (match_operand:SI 3 "gcn_alu_operand")
+   (match_operand:VEC_REG_MODE 4 "register_operand")
+   (match_operand:V64BI 5 "")]
+  ""
+  {
+    rtx exec = force_reg (DImode, gcn_convert_mask_mode (operands[5]));
+
+    /* TODO: more conversions will be needed when more types are vectorized. */
+    if (GET_MODE (operands[1]) == V64DImode)
+      {
+	rtx tmp = gen_reg_rtx (V64SImode);
+	emit_insn (gen_vec_truncatev64div64si (tmp, operands[1],
+					       gcn_gen_undef (V64SImode),
+					       exec));
+	operands[1] = tmp;
+      }
+
+    emit_insn (gen_scatter<mode>_exec (operands[0], operands[1], operands[2],
+				       operands[3], operands[4], exec));
+    DONE;
+  })
+
+; FIXME this should be VEC_REG_MODE, but not all dependencies are implemented.
+(define_mode_iterator COND_MODE [V64SI V64DI V64SF V64DF])
+(define_mode_iterator COND_INT_MODE [V64SI V64DI])
+
+;; Conditional arithmetic: op0 = mask ? op2 <op> op3 : op4, implemented by
+;; running the _vector insn with the mask as EXEC and op4 as merge value.
+(define_code_iterator cond_op [plus minus])
+
+(define_expand "cond_<expander><mode>"
+  [(match_operand:COND_MODE 0 "register_operand")
+   (match_operand:V64BI 1 "register_operand")
+   (cond_op:COND_MODE
+     (match_operand:COND_MODE 2 "gcn_alu_operand")
+     (match_operand:COND_MODE 3 "gcn_alu_operand"))
+   (match_operand:COND_MODE 4 "register_operand")]
+  ""
+  {
+    operands[1] = force_reg (DImode, gcn_convert_mask_mode (operands[1]));
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+
+    emit_insn (gen_<expander><mode>3_vector (operands[0], operands[2],
+					     operands[3], operands[1],
+					     operands[4]));
+    DONE;
+  })
+
+;; Conditional bitwise ops; same scheme as cond_<plus/minus> above.
+(define_code_iterator cond_bitop [and ior xor])
+
+(define_expand "cond_<expander><mode>"
+  [(match_operand:COND_INT_MODE 0 "register_operand")
+   (match_operand:V64BI 1 "register_operand")
+   (cond_bitop:COND_INT_MODE
+     (match_operand:COND_INT_MODE 2 "gcn_alu_operand")
+     (match_operand:COND_INT_MODE 3 "gcn_alu_operand"))
+   (match_operand:COND_INT_MODE 4 "register_operand")]
+  ""
+  {
+    operands[1] = force_reg (DImode, gcn_convert_mask_mode (operands[1]));
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+
+    emit_insn (gen_<expander><mode>3_vector (operands[0], operands[2],
+					     operands[3], operands[1],
+					     operands[4]));
+    DONE;
+  })
+
+;; V64BI-result compares; convert the destination and use the DI patterns.
+(define_expand "vec_cmp<mode>v64bi"
+  [(match_operand:V64BI 0 "register_operand")
+   (match_operator 1 "comparison_operator"
+     [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand")
+      (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand")])]
+  ""
+  {
+    operands[0] = gcn_convert_mask_mode (operands[0]);
+
+    emit_insn (gen_vec_cmp<mode>di (operands[0], operands[1], operands[2],
+				    operands[3]));
+    DONE;
+  })
+
+;; Unsigned variant of vec_cmp<mode>v64bi.
+(define_expand "vec_cmpu<mode>v64bi"
+  [(match_operand:V64BI 0 "register_operand")
+   (match_operator 1 "comparison_operator"
+     [(match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+      (match_operand:VEC_1REG_INT_MODE 3 "gcn_vop3_operand")])]
+  ""
+  {
+    operands[0] = gcn_convert_mask_mode (operands[0]);
+
+    emit_insn (gen_vec_cmpu<mode>di (operands[0], operands[1], operands[2],
+				     operands[3]));
+    DONE;
+  })
+;; {{{ Vector reductions
+
+;; Reductions are implemented as a sequence of DPP (data-parallel
+;; primitive) shifted operations; the scalar result ends up in lane 63.
+;; 2-register (64-bit) element types only support the operations that can
+;; be split into independent 32-bit halves (plus is handled via the
+;; special carry patterns at the end of this section).
+(define_int_iterator REDUC_UNSPEC [UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
+				   UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
+				   UNSPEC_PLUS_DPP_SHR
+				   UNSPEC_AND_DPP_SHR
+				   UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR])
+
+(define_int_iterator REDUC_2REG_UNSPEC [UNSPEC_PLUS_DPP_SHR
+					UNSPEC_AND_DPP_SHR
+					UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR])
+
+; FIXME: Isn't there a better way of doing this?
+(define_int_attr reduc_unspec [(UNSPEC_SMIN_DPP_SHR "UNSPEC_SMIN_DPP_SHR")
+			       (UNSPEC_SMAX_DPP_SHR "UNSPEC_SMAX_DPP_SHR")
+			       (UNSPEC_UMIN_DPP_SHR "UNSPEC_UMIN_DPP_SHR")
+			       (UNSPEC_UMAX_DPP_SHR "UNSPEC_UMAX_DPP_SHR")
+			       (UNSPEC_PLUS_DPP_SHR "UNSPEC_PLUS_DPP_SHR")
+			       (UNSPEC_AND_DPP_SHR "UNSPEC_AND_DPP_SHR")
+			       (UNSPEC_IOR_DPP_SHR "UNSPEC_IOR_DPP_SHR")
+			       (UNSPEC_XOR_DPP_SHR "UNSPEC_XOR_DPP_SHR")])
+
+;; Standard reduc_* pattern-name fragment for each unspec.
+(define_int_attr reduc_op [(UNSPEC_SMIN_DPP_SHR "smin")
+			   (UNSPEC_SMAX_DPP_SHR "smax")
+			   (UNSPEC_UMIN_DPP_SHR "umin")
+			   (UNSPEC_UMAX_DPP_SHR "umax")
+			   (UNSPEC_PLUS_DPP_SHR "plus")
+			   (UNSPEC_AND_DPP_SHR "and")
+			   (UNSPEC_IOR_DPP_SHR "ior")
+			   (UNSPEC_XOR_DPP_SHR "xor")])
+
+;; Assembler mnemonic for each reduction (with size/sign suffix code).
+(define_int_attr reduc_insn [(UNSPEC_SMIN_DPP_SHR "v_min%i0")
+			     (UNSPEC_SMAX_DPP_SHR "v_max%i0")
+			     (UNSPEC_UMIN_DPP_SHR "v_min%u0")
+			     (UNSPEC_UMAX_DPP_SHR "v_max%u0")
+			     (UNSPEC_PLUS_DPP_SHR "v_add%u0")
+			     (UNSPEC_AND_DPP_SHR "v_and%b0")
+			     (UNSPEC_IOR_DPP_SHR "v_or%b0")
+			     (UNSPEC_XOR_DPP_SHR "v_xor%b0")])
+
+;; Reduce a 1-register-mode vector to a scalar.
+(define_expand "reduc_<reduc_op>_scal_<mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand")
+	(unspec:<SCALAR_MODE>
+	  [(match_operand:VEC_1REG_MODE 1 "register_operand")]
+	  REDUC_UNSPEC))]
+  ""
+  {
+    rtx tmp = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
+				       <reduc_unspec>);
+
+    /* The result of the reduction is in lane 63 of tmp.  */
+    emit_insn (gen_mov_from_lane63_<mode> (operands[0], tmp));
+
+    DONE;
+  })
+
+;; Reduce a V64DI vector to a DI scalar (2-register ops only).
+(define_expand "reduc_<reduc_op>_scal_v64di"
+  [(set (match_operand:DI 0 "register_operand")
+	(unspec:DI
+	  [(match_operand:V64DI 1 "register_operand")]
+	  REDUC_2REG_UNSPEC))]
+  ""
+  {
+    rtx tmp = gcn_expand_reduc_scalar (V64DImode, operands[1],
+				       <reduc_unspec>);
+
+    /* The result of the reduction is in lane 63 of tmp.  */
+    emit_insn (gen_mov_from_lane63_v64di (operands[0], tmp));
+
+    DONE;
+  })
+
+;; One DPP-shifted reduction step; operand 3 is the shift amount.
+;; GCN3 lacks a carry-less vector add in DPP form, hence the condition
+;; excluding integer plus there (the carry patterns below are used instead).
+(define_insn "*<reduc_op>_dpp_shr_<mode>"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"   "=v")
+	(unspec:VEC_1REG_MODE
+	  [(match_operand:VEC_1REG_MODE 1 "register_operand" "v")
+	   (match_operand:VEC_1REG_MODE 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"	     "n")]
+	  REDUC_UNSPEC))]
+  "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
+     && <reduc_unspec> == UNSPEC_PLUS_DPP_SHR)"
+  {
+    return gcn_expand_dpp_shr_insn (<MODE>mode, "<reduc_insn>",
+				    <reduc_unspec>, INTVAL (operands[3]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "exec" "full")
+   (set_attr "length" "8")])
+
+;; 64-bit reduction step: split after reload into two independent 32-bit
+;; steps on the low and high register parts.
+(define_insn_and_split "*<reduc_op>_dpp_shr_v64di"
+  [(set (match_operand:V64DI 0 "register_operand"    "=&v")
+	(unspec:V64DI
+	  [(match_operand:V64DI 1 "register_operand" "v0")
+	   (match_operand:V64DI 2 "register_operand" "v0")
+	   (match_operand:SI 3 "const_int_operand"   "n")]
+	  REDUC_2REG_UNSPEC))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 4)
+	(unspec:V64SI
+	  [(match_dup 6) (match_dup 8) (match_dup 3)] REDUC_2REG_UNSPEC))
+   (set (match_dup 5)
+	(unspec:V64SI
+	  [(match_dup 7) (match_dup 9) (match_dup 3)] REDUC_2REG_UNSPEC))]
+  {
+    operands[4] = gcn_operand_part (V64DImode, operands[0], 0);
+    operands[5] = gcn_operand_part (V64DImode, operands[0], 1);
+    operands[6] = gcn_operand_part (V64DImode, operands[1], 0);
+    operands[7] = gcn_operand_part (V64DImode, operands[1], 1);
+    operands[8] = gcn_operand_part (V64DImode, operands[2], 0);
+    operands[9] = gcn_operand_part (V64DImode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "exec" "full")
+   (set_attr "length" "16")])
+
+; Special cases for addition.
+
+;; Addition step that produces a carry-out in VCC (v_add_co on GCN5+).
+(define_insn "*plus_carry_dpp_shr_<mode>"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "register_operand"	"=v")
+	(unspec:VEC_1REG_INT_MODE
+	  [(match_operand:VEC_1REG_INT_MODE 1 "register_operand" "v")
+	   (match_operand:VEC_1REG_INT_MODE 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"		 "n")]
+	  UNSPEC_PLUS_CARRY_DPP_SHR))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  {
+    const char *insn = TARGET_GCN3 ? "v_add%u0" : "v_add_co%u0";
+    return gcn_expand_dpp_shr_insn (<MODE>mode, insn,
+				    UNSPEC_PLUS_CARRY_DPP_SHR,
+				    INTVAL (operands[3]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "exec" "full")
+   (set_attr "length" "8")])
+
+;; Addition step that consumes the carry-in from VCC (operand 4).
+(define_insn "*plus_carry_in_dpp_shr_v64si"
+  [(set (match_operand:V64SI 0 "register_operand"    "=v")
+	(unspec:V64SI
+	  [(match_operand:V64SI 1 "register_operand" "v")
+	   (match_operand:V64SI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"   "n")
+	   (match_operand:DI 4 "register_operand"    "cV")]
+	  UNSPEC_PLUS_CARRY_IN_DPP_SHR))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  {
+    const char *insn = TARGET_GCN3 ? "v_addc%u0" : "v_addc_co%u0";
+    return gcn_expand_dpp_shr_insn (V64SImode, insn,
+				    UNSPEC_PLUS_CARRY_IN_DPP_SHR,
+				    INTVAL (operands[3]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "exec" "full")
+   (set_attr "length" "8")])
+
+;; 64-bit addition step: split into low-part add-with-carry-out followed
+;; by high-part add-with-carry-in, chained through VCC.
+(define_insn_and_split "*plus_carry_dpp_shr_v64di"
+  [(set (match_operand:V64DI 0 "register_operand"    "=&v")
+	(unspec:V64DI
+	  [(match_operand:V64DI 1 "register_operand" "v0")
+	   (match_operand:V64DI 2 "register_operand" "v0")
+	   (match_operand:SI 3 "const_int_operand"   "n")]
+	  UNSPEC_PLUS_CARRY_DPP_SHR))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 4)
+		   (unspec:V64SI
+		     [(match_dup 6) (match_dup 8) (match_dup 3)]
+		     UNSPEC_PLUS_CARRY_DPP_SHR))
+	      (clobber (reg:DI VCC_REG))])
+   (parallel [(set (match_dup 5)
+		   (unspec:V64SI
+		     [(match_dup 7) (match_dup 9) (match_dup 3) (reg:DI VCC_REG)]
+		     UNSPEC_PLUS_CARRY_IN_DPP_SHR))
+	      (clobber (reg:DI VCC_REG))])]
+  {
+    operands[4] = gcn_operand_part (V64DImode, operands[0], 0);
+    operands[5] = gcn_operand_part (V64DImode, operands[0], 1);
+    operands[6] = gcn_operand_part (V64DImode, operands[1], 0);
+    operands[7] = gcn_operand_part (V64DImode, operands[1], 1);
+    operands[8] = gcn_operand_part (V64DImode, operands[2], 0);
+    operands[9] = gcn_operand_part (V64DImode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "exec" "full")
+   (set_attr "length" "16")])
+
+; Instructions to move a scalar value from lane 63 of a vector register.
+(define_insn "mov_from_lane63_<mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
+	(unspec:<SCALAR_MODE>
+	  [(match_operand:VEC_1REG_MODE 1 "register_operand" "v,v")]
+	  UNSPEC_MOV_FROM_LANE63))]
+  ""
+  "@
+   v_readlane_b32\t%0, %1, 63
+   v_mov_b32\t%0, %1 wave_ror:1"
+  [(set_attr "type" "vop3a,vop_dpp")
+   (set_attr "exec" "*,full")
+   (set_attr "length" "8")])
+
+;; 64-bit variant; both alternatives emit two 32-bit moves, ordered to
+;; avoid clobbering the source when registers overlap.
+;; NOTE(review): "length" is declared 8 although each alternative emits
+;; two instructions — confirm whether this should be 16.
+(define_insn "mov_from_lane63_v64di"
+  [(set (match_operand:DI 0 "register_operand"	     "=Sg,v")
+	(unspec:DI
+	  [(match_operand:V64DI 1 "register_operand"   "v,v")]
+	  UNSPEC_MOV_FROM_LANE63))]
+  ""
+  "@
+   v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
+   * if (REGNO (operands[0]) <= REGNO (operands[1]))	\
+       return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\"	\
+	      \"v_mov_b32\t%H0, %H1 wave_ror:1\";	\
+     else						\
+       return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\"	\
+	      \"v_mov_b32\t%L0, %L1 wave_ror:1\";"
+  [(set_attr "type" "vop3a,vop_dpp")
+   (set_attr "exec" "*,full")
+   (set_attr "length" "8")])
+;; {{{ Miscellaneous
+
+;; vec_series: build the vector {op1, op1+op2, op1+2*op2, ...} by scaling
+;; the lane-index vector and adding the base.  VGPR 1 is assumed to hold
+;; the per-lane index sequence 0..63 — TODO confirm against prologue setup.
+(define_expand "vec_seriesv64si"
+  [(match_operand:V64SI 0 "register_operand")
+   (match_operand:SI 1 "gcn_alu_operand")
+   (match_operand:SI 2 "gcn_alu_operand")]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (V64SImode);
+    rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+    rtx undef = gcn_gen_undef (V64SImode);
+    rtx exec = gcn_full_exec_reg ();
+
+    emit_insn (gen_mulv64si3_vector_dup (tmp, v1, operands[2], exec, undef));
+    emit_insn (gen_addv64si3_vector_dup (operands[0], tmp, operands[1], exec,
+					 undef));
+    DONE;
+  })
+
+;; 64-bit vec_series; the lane indices are zero-extended to DI during the
+;; multiply (mulv64di3_vector_zext_dup2).
+(define_expand "vec_seriesv64di"
+  [(match_operand:V64DI 0 "register_operand")
+   (match_operand:DI 1 "gcn_alu_operand")
+   (match_operand:DI 2 "gcn_alu_operand")]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (V64DImode);
+    rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+    rtx undef = gcn_gen_undef (V64DImode);
+    rtx exec = gcn_full_exec_reg ();
+
+    emit_insn (gen_mulv64di3_vector_zext_dup2 (tmp, v1, operands[2], exec,
+					       undef));
+    emit_insn (gen_addv64di3_vector_dup (operands[0], tmp, operands[1], exec,
+					 undef));
+    DONE;
+  })
+
+;; }}}