[OpenACC,2/11] PTX backend changes

Message ID 5630DAE0.1020000@acm.org
State New

Commit Message

Nathan Sidwell Oct. 28, 2015, 2:25 p.m. UTC
This is the patch I've just committed.

It includes the new target hook overrides, which were originally in patch 3.

nathan

Patch

2015-10-28  Nathan Sidwell  <nathan@codesourcery.com>

	* config/nvptx/nvptx.h (struct machine_function): Add
	axis_predicate.
	* config/nvptx/nvptx-protos.h (nvptx_expand_oacc_fork,
	nvptx_expand_oacc_join): Declare.
	* config/nvptx/nvptx.md (UNSPEC_NTID, UNSPEC_TID): Delete.
	(UNSPEC_DIM_SIZE, UNSPEC_SHARED_DATA, UNSPEC_BIT_CONV,
	UNSPEC_SHUFFLE, UNSPEC_BR_UNIFIED): New.
	(UNSPECV_BARSYNC, UNSPECV_DIM_POS, UNSPECV_FORK, UNSPECV_FORKED,
	UNSPECV_JOINING, UNSPECV_JOIN): New.
	(BITS, BITD): New mode iterators.
	(br_true_uni, br_false_uni): New.
	(*oacc_ntid_insn, oacc_ntid, *oacc_tid_insn, oacc_tid): Delete.
	(oacc_dim_size, oacc_dim_pos): New.
	(nvptx_fork, nvptx_forked, nvptx_joining, nvptx_join): New.
	(oacc_fork, oacc_join): New.
	(nvptx_shuffle<mode>, unpack<mode>si2, packsi<mode>2): New.
	(worker_load<mode>, worker_store<mode>): New.
	(nvptx_barsync): New.
	* config/nvptx/nvptx.c: Include gimple.h & dumpfile.h.
	(SHUFFLE_UP, SHUFFLE_DOWN, SHUFFLE_BFLY, SHUFFLE_IDX): Define.
	(worker_bcast_hwm, worker_bcast_align, worker_bcast_name,
	worker_bcast_sym): New.
	(nvptx_option_override): Initialize worker broadcast buffer.
	(nvptx_emit_forking, nvptx_emit_joining): New.
	(nvptx_init_axis_predicate): New.
	(nvptx_declare_function_name): Init axis predicates.
	(nvptx_expand_call): Add fork/join markers around routine call.
	(nvptx_expand_oacc_fork, nvptx_expand_oacc_join): New.
	(nvptx_gen_unpack, nvptx_gen_pack, nvptx_gen_shuffle): New.
	(nvptx_gen_vcast): New.
	(struct wcast_data_t): New.
	(enum propagate_mask): New.
	(nvptx_gen_wcast): New.
	(nvptx_print_operand): Add 'S' case.
	(struct parallel): New.
	(parallel::parallel, parallel::~parallel): New.
	(bb_insn_map_t, insn_bb_t, insn_bb_vec_t): New typedefs.
	(nvptx_split_blocks, nvptx_discover_pre, nvptx_dump_pars,
	nvptx_find_par, nvptx_discover_pars): New.
	(nvptx_propagate): New.
	(vprop_gen, nvptx_vpropagate): New.
	(wprop_gen, nvptx_wpropagate): New.
	(nvptx_wsync): New.
	(nvptx_single, nvptx_skip_par): New.
	(nvptx_process_pars, nvptx_neuter_pars): New.
	(nvptx_reorg): Split blocks, generate parallel structure, apply
	neutering.
	(nvptx_cannot_copy_insn_p): New.
	(nvptx_file_end): Emit worker broadcast decl.
	(nvptx_goacc_fork_join): New.
	(TARGET_CANNOT_COPY_INSN_P): Override.
	(TARGET_GOACC_FORK_JOIN): Override.

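A note on the overall scheme (an editorial illustration, not part of the
patch): partitioned regions are bracketed by fork/join marker unspecs,
and code that must run in "single" mode is neutered so that only thread
zero of the relevant axis computes, with the result then broadcast to
the idle threads.  A minimal standalone C model of that idea follows;
all names in it (axis_id, bcast_buffer, single_block) are invented for
the sketch:

#include <stdio.h>

static int axis_id;       /* stands in for %tid.y or %tid.x */
static int bcast_buffer;  /* stands in for the shared broadcast buffer */

static int
single_block (int live_in)
{
  if (axis_id == 0)             /* the inserted skip branch */
    bcast_buffer = live_in * 2; /* only thread zero computes: spill */
  /* a bar.sync separates the write from the reads here */
  return bcast_buffer;          /* fill: every thread reads the result */
}

int
main (void)
{
  for (axis_id = 0; axis_id < 4; axis_id++)
    printf ("thread %d sees %d\n", axis_id, single_block (21));
  return 0;
}

In the patch itself the skip is a br_true/br_true_uni branch on a
per-function axis predicate, and the buffer is the __worker_bcast array
emitted at file end.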
Index: gcc/config/nvptx/nvptx-protos.h
===================================================================
--- gcc/config/nvptx/nvptx-protos.h	(revision 229472)
+++ gcc/config/nvptx/nvptx-protos.h	(working copy)
@@ -32,6 +32,8 @@  extern void nvptx_register_pragmas (void
 extern const char *nvptx_section_for_decl (const_tree);
 
 #ifdef RTX_CODE
+extern void nvptx_expand_oacc_fork (unsigned);
+extern void nvptx_expand_oacc_join (unsigned);
 extern void nvptx_expand_call (rtx, rtx);
 extern rtx nvptx_expand_compare (rtx);
 extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
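
For orientation: the unsigned argument to these new entry points is a
GOMP_DIM_* level.  The relevant constants, reproduced here from
include/gomp-constants.h (abridged), are turned into single-bit masks
by the expanders:

#define GOMP_DIM_GANG    0
#define GOMP_DIM_WORKER  1
#define GOMP_DIM_VECTOR  2
#define GOMP_DIM_MAX     3
#define GOMP_DIM_MASK(X) (1u << (X))

So nvptx_expand_oacc_fork (GOMP_DIM_WORKER) emits the fork/forked
markers with mask GOMP_DIM_MASK (GOMP_DIM_WORKER).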
Index: gcc/config/nvptx/nvptx.md
===================================================================
--- gcc/config/nvptx/nvptx.md	(revision 229472)
+++ gcc/config/nvptx/nvptx.md	(working copy)
@@ -49,14 +49,27 @@ 
 
    UNSPEC_ALLOCA
 
-   UNSPEC_NTID
-   UNSPEC_TID
+   UNSPEC_DIM_SIZE
+
+   UNSPEC_SHARED_DATA
+
+   UNSPEC_BIT_CONV
+
+   UNSPEC_SHUFFLE
+   UNSPEC_BR_UNIFIED
 ])
 
 (define_c_enum "unspecv" [
    UNSPECV_LOCK
    UNSPECV_CAS
    UNSPECV_XCHG
+   UNSPECV_BARSYNC
+   UNSPECV_DIM_POS
+
+   UNSPECV_FORK
+   UNSPECV_FORKED
+   UNSPECV_JOINING
+   UNSPECV_JOIN
 ])
 
 (define_attr "subregs_ok" "false,true"
@@ -246,6 +259,8 @@ 
 (define_mode_iterator QHSIM [QI HI SI])
 (define_mode_iterator SDFM [SF DF])
 (define_mode_iterator SDCM [SC DC])
+(define_mode_iterator BITS [SI SF])
+(define_mode_iterator BITD [DI DF])
 
 ;; This mode iterator allows :P to be used for patterns that operate on
 ;; pointer-sized quantities.  Exactly one of the two alternatives will match.
@@ -817,6 +832,23 @@ 
   ""
   "%J0\\tbra\\t%l1;")
 
+;; unified conditional branch
+(define_insn "br_true_uni"
+  [(set (pc) (if_then_else
+	(ne (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
+		       UNSPEC_BR_UNIFIED) (const_int 0))
+        (label_ref (match_operand 1 "" "")) (pc)))]
+  ""
+  "%j0\\tbra.uni\\t%l1;")
+
+(define_insn "br_false_uni"
+  [(set (pc) (if_then_else
+	(eq (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
+		       UNSPEC_BR_UNIFIED) (const_int 0))
+        (label_ref (match_operand 1 "" "")) (pc)))]
+  ""
+  "%J0\\tbra.uni\\t%l1;")
+
 (define_expand "cbranch<mode>4"
   [(set (pc)
 	(if_then_else (match_operator 0 "nvptx_comparison_operator"
@@ -1308,36 +1340,134 @@ 
   DONE;
 })
 
-(define_insn "*oacc_ntid_insn"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_NTID))]
+(define_insn "oacc_dim_size"
+  [(set (match_operand:SI 0 "nvptx_register_operand" "")
+	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")]
+		   UNSPEC_DIM_SIZE))]
   ""
-  "%.\\tmov.u32 %0, %%ntid%d1;")
+{
+  static const char *const asms[] =
+{ /* Must match oacc_loop_levels ordering.  */
+  "%.\\tmov.u32\\t%0, %%nctaid.x;",	/* gang */
+  "%.\\tmov.u32\\t%0, %%ntid.y;",	/* worker */
+  "%.\\tmov.u32\\t%0, %%ntid.x;",	/* vector */
+};
+  return asms[INTVAL (operands[1])];
+})
 
-(define_expand "oacc_ntid"
+(define_insn "oacc_dim_pos"
   [(set (match_operand:SI 0 "nvptx_register_operand" "")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_NTID))]
+	(unspec_volatile:SI [(match_operand:SI 1 "const_int_operand" "")]
+			    UNSPECV_DIM_POS))]
   ""
 {
-  if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
-    FAIL;
+  static const char *const asms[] =
+{ /* Must match oacc_loop_levels ordering.  */
+  "%.\\tmov.u32\\t%0, %%ctaid.x;",	/* gang */
+  "%.\\tmov.u32\\t%0, %%tid.y;",	/* worker */
+  "%.\\tmov.u32\\t%0, %%tid.x;",	/* vector */
+};
+  return asms[INTVAL (operands[1])];
 })
 
-(define_insn "*oacc_tid_insn"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_TID))]
+(define_insn "nvptx_fork"
+  [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+		       UNSPECV_FORK)]
   ""
-  "%.\\tmov.u32 %0, %%tid%d1;")
+  "// fork %0;"
+)
 
-(define_expand "oacc_tid"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_TID))]
+(define_insn "nvptx_forked"
+  [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+		       UNSPECV_FORKED)]
+  ""
+  "// forked %0;"
+)
+
+(define_insn "nvptx_joining"
+  [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+		       UNSPECV_JOINING)]
+  ""
+  "// joining %0;"
+)
+
+(define_insn "nvptx_join"
+  [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+		       UNSPECV_JOIN)]
+  ""
+  "// join %0;"
+)
+
+(define_expand "oacc_fork"
+  [(set (match_operand:SI 0 "nvptx_nonmemory_operand" "")
+        (match_operand:SI 1 "nvptx_general_operand" ""))
+   (unspec_volatile:SI [(match_operand:SI 2 "const_int_operand" "")]
+		        UNSPECV_FORKED)]
   ""
 {
-  if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
-    FAIL;
+  if (operands[0] != const0_rtx)
+    emit_move_insn (operands[0], operands[1]);
+  nvptx_expand_oacc_fork (INTVAL (operands[2]));
+  DONE;
 })
 
+(define_expand "oacc_join"
+  [(set (match_operand:SI 0 "nvptx_nonmemory_operand" "")
+        (match_operand:SI 1 "nvptx_general_operand" ""))
+   (unspec_volatile:SI [(match_operand:SI 2 "const_int_operand" "")]
+		        UNSPECV_JOIN)]
+  ""
+{
+  if (operands[0] != const0_rtx)
+    emit_move_insn (operands[0], operands[1]);
+  nvptx_expand_oacc_join (INTVAL (operands[2]));
+  DONE;
+})
+
+;; only 32-bit shuffles exist.
+(define_insn "nvptx_shuffle<mode>"
+  [(set (match_operand:BITS 0 "nvptx_register_operand" "=R")
+	(unspec:BITS
+		[(match_operand:BITS 1 "nvptx_register_operand" "R")
+		 (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")
+		 (match_operand:SI 3 "const_int_operand" "n")]
+		  UNSPEC_SHUFFLE))]
+  ""
+  "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;")
+
+;; extract parts of a 64 bit object into 2 32-bit ints
+(define_insn "unpack<mode>si2"
+  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+        (unspec:SI [(match_operand:BITD 2 "nvptx_register_operand" "R")
+		    (const_int 0)] UNSPEC_BIT_CONV))
+   (set (match_operand:SI 1 "nvptx_register_operand" "=R")
+        (unspec:SI [(match_dup 2) (const_int 1)] UNSPEC_BIT_CONV))]
+  ""
+  "%.\\tmov.b64\\t{%0,%1}, %2;")
+
+;; pack 2 32-bit ints into a 64 bit object
+(define_insn "packsi<mode>2"
+  [(set (match_operand:BITD 0 "nvptx_register_operand" "=R")
+        (unspec:BITD [(match_operand:SI 1 "nvptx_register_operand" "R")
+		      (match_operand:SI 2 "nvptx_register_operand" "R")]
+		    UNSPEC_BIT_CONV))]
+  ""
+  "%.\\tmov.b64\\t%0, {%1,%2};")
+
+(define_insn "worker_load<mode>"
+  [(set (match_operand:SDISDFM 0 "nvptx_register_operand" "=R")
+        (unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "m")]
+			 UNSPEC_SHARED_DATA))]
+  ""
+  "%.\\tld.shared%u0\\t%0, %1;")
+
+(define_insn "worker_store<mode>"
+  [(set (unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "=m")]
+			 UNSPEC_SHARED_DATA)
+	(match_operand:SDISDFM 0 "nvptx_register_operand" "R"))]
+  ""
+  "%.\\tst.shared%u1\\t%1, %0;")
+
 ;; Atomic insns.
 
 (define_expand "atomic_compare_and_swap<mode>"
@@ -1423,3 +1553,9 @@ 
 	(match_dup 1))]
   "0"
   "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;")
+
+(define_insn "nvptx_barsync"
+  [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+		    UNSPECV_BARSYNC)]
+  ""
+  "\\tbar.sync\\t%0;")
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 229472)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -51,14 +51,21 @@ 
 #include "langhooks.h"
 #include "dbxout.h"
 #include "cfgrtl.h"
+#include "gimple.h"
 #include "stor-layout.h"
 #include "builtins.h"
 #include "omp-low.h"
 #include "gomp-constants.h"
+#include "dumpfile.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
 
+#define SHUFFLE_UP 0
+#define SHUFFLE_DOWN 1
+#define SHUFFLE_BFLY 2
+#define SHUFFLE_IDX 3
+
 /* Record the function decls we've written, and the libfuncs and function
    decls corresponding to them.  */
 static std::stringstream func_decls;
@@ -81,6 +88,16 @@  struct tree_hasher : ggc_cache_ptr_hash<
 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
 
+/* Size of buffer needed to broadcast across workers.  This is used
+   for both worker-neutering and worker broadcasting.  It is shared
+   by all functions emitted.  The buffer is placed in shared memory.
+   It'd be nice if PTX supported common blocks, because then this
+   could be shared across TUs (taking the largest size).  */
+static unsigned worker_bcast_size;
+static unsigned worker_bcast_align;
+#define worker_bcast_name "__worker_bcast"
+static GTY(()) rtx worker_bcast_sym;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -108,6 +125,9 @@  nvptx_option_override (void)
   needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
   declared_libfuncs_htab
     = hash_table<declared_libfunc_hasher>::create_ggc (17);
+
+  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
+  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
 }
 
 /* Return the mode to be used when declaring a ptx object for OBJ.
@@ -194,6 +214,47 @@  nvptx_split_reg_p (machine_mode mode)
   return false;
 }
 
+/* Emit forking instructions for MASK.  */
+
+static void
+nvptx_emit_forking (unsigned mask, bool is_call)
+{
+  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+  if (mask)
+    {
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
+      
+      /* Emit fork at all levels.  This helps form SESE regions, as
+	 it creates a block with a single successor before entering a
+	 partitioned region.  That is a good candidate for the end of
+	 an SESE region.  */
+      if (!is_call)
+	emit_insn (gen_nvptx_fork (op));
+      emit_insn (gen_nvptx_forked (op));
+    }
+}
+
+/* Emit joining instructions for MASK.  */
+
+static void
+nvptx_emit_joining (unsigned mask, bool is_call)
+{
+  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+  if (mask)
+    {
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
+
+      /* Emit joining for all non-call pars to ensure there's a single
+	 predecessor for the block the join insn ends up in.  This is
+	 needed for skipping entire loops.  */
+      if (!is_call)
+	emit_insn (gen_nvptx_joining (op));
+      emit_insn (gen_nvptx_join (op));
+    }
+}
+
 #define PASS_IN_REG_P(MODE, TYPE)				\
   ((GET_MODE_CLASS (MODE) == MODE_INT				\
     || GET_MODE_CLASS (MODE) == MODE_FLOAT			\
@@ -500,6 +561,19 @@  nvptx_record_needed_fndecl (tree decl)
     *slot = decl;
 }
 
+/* Emit code to initialize the REGNO predicate register to indicate
+   whether we are not lane zero on the NAME axis.  */
+
+static void
+nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
+{
+  fprintf (file, "\t{\n");
+  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
+  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
+  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
+  fprintf (file, "\t}\n");
+}
+
 /* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
    function, including local var decls and copies from the arguments to
    local regs.  */
@@ -623,6 +697,14 @@  nvptx_declare_function_name (FILE *file,
   if (stdarg_p (fntype))
     fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
 	     GET_MODE_BITSIZE (Pmode));
+
+  /* Emit axis predicates.  */
+  if (cfun->machine->axis_predicate[0])
+    nvptx_init_axis_predicate (file,
+			       REGNO (cfun->machine->axis_predicate[0]), "y");
+  if (cfun->machine->axis_predicate[1])
+    nvptx_init_axis_predicate (file,
+			       REGNO (cfun->machine->axis_predicate[1]), "x");
 }
 
 /* Output a return instruction.  Also copy the return value to its outgoing
@@ -779,6 +861,7 @@  nvptx_expand_call (rtx retval, rtx addre
   bool external_decl = false;
   rtx varargs = NULL_RTX;
   tree decl_type = NULL_TREE;
+  unsigned parallel = 0;
 
   for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
     nargs++;
@@ -799,6 +882,22 @@  nvptx_expand_call (rtx retval, rtx addre
 	    cfun->machine->has_call_with_sc = true;
 	  if (DECL_EXTERNAL (decl))
 	    external_decl = true;
+	  tree attr = get_oacc_fn_attrib (decl);
+	  if (attr)
+	    {
+	      tree dims = TREE_VALUE (attr);
+
+	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
+	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
+		{
+		  if (TREE_PURPOSE (dims)
+		      && !integer_zerop (TREE_PURPOSE (dims)))
+		    break;
+		  /* Not on this axis.  */
+		  parallel ^= GOMP_DIM_MASK (ix);
+		  dims = TREE_CHAIN (dims);
+		}
+	    }
 	}
     }
 
@@ -860,7 +959,11 @@  nvptx_expand_call (rtx retval, rtx addre
 	  write_func_decl_from_insn (func_decls, retval, pat, callee);
 	}
     }
+
+  nvptx_emit_forking (parallel, true);
   emit_call_insn (pat);
+  nvptx_emit_joining (parallel, true);
+
   if (tmp_retval != retval)
     emit_move_insn (retval, tmp_retval);
 }
@@ -1069,6 +1172,214 @@  nvptx_expand_compare (rtx compare)
   return gen_rtx_NE (BImode, pred, const0_rtx);
 }
 
+/* Expand the oacc fork & join primitive into ptx-required unspecs.  */
+
+void
+nvptx_expand_oacc_fork (unsigned mode)
+{
+  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
+}
+
+void
+nvptx_expand_oacc_join (unsigned mode)
+{
+  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
+}
+
+/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
+   objects.  */
+
+static rtx
+nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
+{
+  rtx res;
+  
+  switch (GET_MODE (src))
+    {
+    case DImode:
+      res = gen_unpackdisi2 (dst0, dst1, src);
+      break;
+    case DFmode:
+      res = gen_unpackdfsi2 (dst0, dst1, src);
+      break;
+    default: gcc_unreachable ();
+    }
+  return res;
+}
+
+/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
+   object.  */
+
+static rtx
+nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
+{
+  rtx res;
+  
+  switch (GET_MODE (dst))
+    {
+    case DImode:
+      res = gen_packsidi2 (dst, src0, src1);
+      break;
+    case DFmode:
+      res = gen_packsidf2 (dst, src0, src1);
+      break;
+    default: gcc_unreachable ();
+    }
+  return res;
+}
+
+/* Generate an instruction or sequence performing a warp shuffle of
+   SRC into DST, using lane operand IDX and shuffle kind KIND.  */
+
+static rtx
+nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
+{
+  rtx res;
+
+  switch (GET_MODE (dst))
+    {
+    case SImode:
+      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
+      break;
+    case SFmode:
+      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
+      break;
+    case DImode:
+    case DFmode:
+      {
+	rtx tmp0 = gen_reg_rtx (SImode);
+	rtx tmp1 = gen_reg_rtx (SImode);
+
+	start_sequence ();
+	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
+	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
+	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
+	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
+	res = get_insns ();
+	end_sequence ();
+      }
+      break;
+    case BImode:
+      {
+	rtx tmp = gen_reg_rtx (SImode);
+	
+	start_sequence ();
+	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
+	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
+	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
+	res = get_insns ();
+	end_sequence ();
+      }
+      break;
+      
+    default:
+      gcc_unreachable ();
+    }
+  return res;
+}
+
+/* Generate an instruction or sequence to broadcast register REG
+   across the vectors of a single warp.  */
+
+static rtx
+nvptx_gen_vcast (rtx reg)
+{
+  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
+}
+
+/* Structure used when generating a worker-level spill or fill.  */
+
+struct wcast_data_t
+{
+  rtx base;  /* Register holding base addr of buffer.  */
+  rtx ptr;  /* Iteration var, if needed.  */
+  unsigned offset; /* Offset into worker buffer.  */
+};
+
+/* Direction of the spill/fill and looping setup/teardown indicator.  */
+
+enum propagate_mask
+  {
+    PM_read = 1 << 0,
+    PM_write = 1 << 1,
+    PM_loop_begin = 1 << 2,
+    PM_loop_end = 1 << 3,
+
+    PM_read_write = PM_read | PM_write
+  };
+
+/* Generate instruction(s) to spill or fill register REG to/from the
+   worker broadcast array.  PM indicates what is to be done, REP
+   how many loop iterations will be executed (0 for not a loop).  */
+   
+static rtx
+nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
+{
+  rtx  res;
+  machine_mode mode = GET_MODE (reg);
+
+  switch (mode)
+    {
+    case BImode:
+      {
+	rtx tmp = gen_reg_rtx (SImode);
+	
+	start_sequence ();
+	if (pm & PM_read)
+	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
+	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
+	if (pm & PM_write)
+	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
+	res = get_insns ();
+	end_sequence ();
+      }
+      break;
+
+    default:
+      {
+	rtx addr = data->ptr;
+
+	if (!addr)
+	  {
+	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
+
+	    if (align > worker_bcast_align)
+	      worker_bcast_align = align;
+	    data->offset = (data->offset + align - 1) & ~(align - 1);
+	    addr = data->base;
+	    if (data->offset)
+	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
+	  }
+	
+	addr = gen_rtx_MEM (mode, addr);
+	addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
+	if (pm == PM_read)
+	  res = gen_rtx_SET (addr, reg);
+	else if (pm == PM_write)
+	  res = gen_rtx_SET (reg, addr);
+	else
+	  gcc_unreachable ();
+
+	if (data->ptr)
+	  {
+	    /* We're using a ptr, increment it.  */
+	    start_sequence ();
+	    
+	    emit_insn (res);
+	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
+				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
+	    res = get_insns ();
+	    end_sequence ();
+	  }
+	else
+	  rep = 1;
+	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
+      }
+      break;
+    }
+  return res;
+}
+
 /* When loading an operand ORIG_OP, verify whether an address space
    conversion to generic is required, and if so, perform it.  Also
    check for SYMBOL_REFs for function decls and call
@@ -1660,6 +1971,7 @@  nvptx_print_operand_address (FILE *file,
    c -- print an opcode suffix for a comparison operator, including a type code
    d -- print a CONST_INT as a vector dimension (x, y, or z)
    f -- print a full reg even for something that must always be split
+   S -- print a shuffle kind specified by CONST_INT
    t -- print a type opcode suffix, promoting QImode to 32 bits
    T -- print a type size in bits
    u -- print a type opcode suffix without promotions.  */
@@ -1723,6 +2035,15 @@  nvptx_print_operand (FILE *file, rtx x,
       fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
       break;
 
+    case 'S':
+      {
+	unsigned kind = UINTVAL (x);
+	static const char *const kinds[] = 
+	  {"up", "down", "bfly", "idx"};
+	fprintf (file, ".%s", kinds[kind]);
+      }
+      break;
+
     case 'T':
       fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
       break;
@@ -1973,10 +2294,747 @@  nvptx_reorg_subreg (void)
     }
 }
 
+/* Loop structure of the function. The entire function is described as
+   a NULL loop.  We should be able to extend this to represent
+   superblocks.  */
+
+struct parallel
+{
+  /* Parent parallel.  */
+  parallel *parent;
+  
+  /* Next sibling parallel.  */
+  parallel *next;
+
+  /* First child parallel.  */
+  parallel *inner;
+
+  /* Partitioning mask of the parallel.  */
+  unsigned mask;
+
+  /* Partitioning used within inner parallels. */
+  unsigned inner_mask;
+
+  /* Location of the parallel's forked and join markers.  The forked
+     is the first block in the parallel and the join is the first
+     block after the partition.  */
+  basic_block forked_block;
+  basic_block join_block;
+
+  rtx_insn *forked_insn;
+  rtx_insn *join_insn;
+
+  rtx_insn *fork_insn;
+  rtx_insn *joining_insn;
+
+  /* Basic blocks in this parallel, but not in child parallels.  The
+     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
+     blocks are not.  */
+  auto_vec<basic_block> blocks;
+
+public:
+  parallel (parallel *parent, unsigned mode);
+  ~parallel ();
+};
+
+/* Constructor links the new parallel into its parent's chain of
+   children.  */
+
+parallel::parallel (parallel *parent_, unsigned mask_)
+  :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
+{
+  forked_block = join_block = 0;
+  forked_insn = join_insn = 0;
+  fork_insn = joining_insn = 0;
+  
+  if (parent)
+    {
+      next = parent->inner;
+      parent->inner = this;
+    }
+}
+
+parallel::~parallel ()
+{
+  delete inner;
+  delete next;
+}
+
+/* Map of basic blocks to insns.  */
+typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
+
+/* A tuple of an insn of interest and the BB in which it resides.  */
+typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
+typedef auto_vec<insn_bb_t> insn_bb_vec_t;
+
+/* Split basic blocks such that each forked and join unspec is at
+   the start of its basic block.  Thus afterwards each block will
+   have a single partitioning mode.  We also do the same for return
+   insns, as they are executed by every thread.  Populate MAP with
+   head and tail blocks.  We also clear the BB visited flag, which is
+   used when finding partitions.  */
+
+static void
+nvptx_split_blocks (bb_insn_map_t *map)
+{
+  insn_bb_vec_t worklist;
+  basic_block block;
+  rtx_insn *insn;
+
+  /* Locate all the reorg instructions of interest.  */
+  FOR_ALL_BB_FN (block, cfun)
+    {
+      bool seen_insn = false;
+
+      /* Clear visited flag, for use by the parallel locator.  */
+      block->flags &= ~BB_VISITED;
+
+      FOR_BB_INSNS (block, insn)
+	{
+	  if (!INSN_P (insn))
+	    continue;
+	  switch (recog_memoized (insn))
+	    {
+	    default:
+	      seen_insn = true;
+	      continue;
+	    case CODE_FOR_nvptx_forked:
+	    case CODE_FOR_nvptx_join:
+	      break;
+
+	    case CODE_FOR_return:
+	      /* We also need to split just before return insns, as
+		 that insn needs executing by all threads, but the
+		 block it is in probably does not.  */
+	      break;
+	    }
+
+	  if (seen_insn)
+	    /* We've found an instruction that must be at the start of
+	       a block, but isn't.  Add it to the worklist.  */
+	    worklist.safe_push (insn_bb_t (insn, block));
+	  else
+	    /* It was already the first instruction.  Just add it to
+	       the map.  */
+	    map->get_or_insert (block) = insn;
+	  seen_insn = true;
+	}
+    }
+
+  /* Split blocks on the worklist.  */
+  unsigned ix;
+  insn_bb_t *elt;
+  basic_block remap = 0;
+  for (ix = 0; worklist.iterate (ix, &elt); ix++)
+    {
+      if (remap != elt->second)
+	{
+	  block = elt->second;
+	  remap = block;
+	}
+      
+      /* Split block before insn.  The insn is in the new block.  */
+      edge e = split_block (block, PREV_INSN (elt->first));
+
+      block = e->dest;
+      map->get_or_insert (block) = elt->first;
+    }
+}
+
+/* BLOCK is a basic block containing a head or tail instruction.
+   Locate the associated prehead or pretail instruction, which must be
+   in the single predecessor block.  */
+
+static rtx_insn *
+nvptx_discover_pre (basic_block block, int expected)
+{
+  gcc_assert (block->preds->length () == 1);
+  basic_block pre_block = (*block->preds)[0]->src;
+  rtx_insn *pre_insn;
+
+  for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
+       pre_insn = PREV_INSN (pre_insn))
+    gcc_assert (pre_insn != BB_HEAD (pre_block));
+
+  gcc_assert (recog_memoized (pre_insn) == expected);
+  return pre_insn;
+}
+
+/* Dump this parallel and all its inner parallels.  */
+
+static void
+nvptx_dump_pars (parallel *par, unsigned depth)
+{
+  fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
+	   depth, par->mask,
+	   par->forked_block ? par->forked_block->index : -1,
+	   par->join_block ? par->join_block->index : -1);
+
+  fprintf (dump_file, "    blocks:");
+
+  basic_block block;
+  for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
+    fprintf (dump_file, " %d", block->index);
+  fprintf (dump_file, "\n");
+  if (par->inner)
+    nvptx_dump_pars (par->inner, depth + 1);
+
+  if (par->next)
+    nvptx_dump_pars (par->next, depth);
+}
+
+/* If BLOCK contains a fork/join marker, process it to create or
+   terminate a loop structure.  Add this block to the current loop,
+   and then walk successor blocks.  */
+
+static parallel *
+nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
+{
+  if (block->flags & BB_VISITED)
+    return par;
+  block->flags |= BB_VISITED;
+
+  if (rtx_insn **endp = map->get (block))
+    {
+      rtx_insn *end = *endp;
+
+      /* This is a block head or tail, or return instruction.  */
+      switch (recog_memoized (end))
+	{
+	case CODE_FOR_return:
+	  /* Return instructions are in their own block, and we
+	     don't need to do anything more.  */
+	  return par;
+
+	case CODE_FOR_nvptx_forked:
+	  /* Loop head, create a new inner loop and add it into
+	     our parent's child list.  */
+	  {
+	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+
+	    gcc_assert (mask);
+	    par = new parallel (par, mask);
+	    par->forked_block = block;
+	    par->forked_insn = end;
+	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	      par->fork_insn
+		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
+	  }
+	  break;
+
+	case CODE_FOR_nvptx_join:
+	  /* A loop tail.  Finish the current loop and return to
+	     parent.  */
+	  {
+	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+
+	    gcc_assert (par->mask == mask);
+	    par->join_block = block;
+	    par->join_insn = end;
+	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	      par->joining_insn
+		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
+	    par = par->parent;
+	  }
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+
+  if (par)
+    /* Add this block onto the current loop's list of blocks.  */
+    par->blocks.safe_push (block);
+  else
+    /* This must be the entry block.  Create a NULL parallel.  */
+    par = new parallel (0, 0);
+
+  /* Walk successor blocks.  */
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, block->succs)
+    nvptx_find_par (map, par, e->dest);
+
+  return par;
+}
+
+/* DFS walk the CFG looking for fork & join markers.  Construct
+   loop structures as we go.  MAP is a mapping of basic blocks
+   to head & tail markers, discovered when splitting blocks.  This
+   speeds up the discovery.  We rely on the BB visited flag having
+   been cleared when splitting blocks.  */
+
+static parallel *
+nvptx_discover_pars (bb_insn_map_t *map)
+{
+  basic_block block;
+
+  /* Mark exit blocks as visited.  */
+  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+  block->flags |= BB_VISITED;
+
+  /* And entry block as not.  */
+  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+  block->flags &= ~BB_VISITED;
+
+  parallel *par = nvptx_find_par (map, 0, block);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "\nLoops\n");
+      nvptx_dump_pars (par, 0);
+      fprintf (dump_file, "\n");
+    }
+  
+  return par;
+}
+
+/* Propagate live state at the start of a partitioned region.  BLOCK
+   provides the live register information, and might not contain
+   INSN. Propagation is inserted just after INSN. RW indicates whether
+   we are reading and/or writing state.  This
+   separation is needed for worker-level proppagation where we
+   essentially do a spill & fill.  FN is the underlying worker
+   function to generate the propagation instructions for single
+   register.  DATA is user data.
+
+   We propagate the live register set and the entire frame.  We could
+   do better by (a) propagating just the live set that is used within
+   the partitioned regions and (b) only propagating stack entries that
+   are used.  The latter might be quite hard to determine.  */
+
+typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
+
+static void
+nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
+		 propagator_fn fn, void *data)
+{
+  bitmap live = DF_LIVE_IN (block);
+  bitmap_iterator iterator;
+  unsigned ix;
+
+  /* Copy the frame array.  */
+  HOST_WIDE_INT fs = get_frame_size ();
+  if (fs)
+    {
+      rtx tmp = gen_reg_rtx (DImode);
+      rtx idx = NULL_RTX;
+      rtx ptr = gen_reg_rtx (Pmode);
+      rtx pred = NULL_RTX;
+      rtx_code_label *label = NULL;
+
+      gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
+      fs /= GET_MODE_SIZE (DImode);
+      /* Detect single iteration loop.  */
+      if (fs == 1)
+	fs = 0;
+
+      start_sequence ();
+      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
+      if (fs)
+	{
+	  idx = gen_reg_rtx (SImode);
+	  pred = gen_reg_rtx (BImode);
+	  label = gen_label_rtx ();
+	  
+	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
+	  /* Allow worker function to initialize anything needed.  */
+	  rtx init = fn (tmp, PM_loop_begin, fs, data);
+	  if (init)
+	    emit_insn (init);
+	  emit_label (label);
+	  LABEL_NUSES (label)++;
+	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
+	}
+      if (rw & PM_read)
+	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
+      emit_insn (fn (tmp, rw, fs, data));
+      if (rw & PM_write)
+	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
+      if (fs)
+	{
+	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
+	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
+	  emit_insn (gen_br_true_uni (pred, label));
+	  rtx fini = fn (tmp, PM_loop_end, fs, data);
+	  if (fini)
+	    emit_insn (fini);
+	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
+	}
+      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
+      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
+      rtx cpy = get_insns ();
+      end_sequence ();
+      insn = emit_insn_after (cpy, insn);
+    }
+
+  /* Copy live registers.  */
+  EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+    {
+      rtx reg = regno_reg_rtx[ix];
+
+      if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+	{
+	  rtx bcast = fn (reg, rw, 0, data);
+
+	  insn = emit_insn_after (bcast, insn);
+	}
+    }
+}
+
+/* Worker for nvptx_vpropagate.  */
+
+static rtx
+vprop_gen (rtx reg, propagate_mask pm,
+	   unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
+{
+  if (!(pm & PM_read_write))
+    return 0;
+  
+  return nvptx_gen_vcast (reg);
+}
+
+/* Propagate state that is live at start of BLOCK across the vectors
+   of a single warp.  Propagation is inserted just after INSN.   */
+
+static void
+nvptx_vpropagate (basic_block block, rtx_insn *insn)
+{
+  nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+}
+
+/* Worker for nvptx_wpropagate.  */
+
+static rtx
+wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
+{
+  wcast_data_t *data = (wcast_data_t *)data_;
+
+  if (pm & PM_loop_begin)
+    {
+      /* Starting a loop, initialize pointer.  */
+      unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
+
+      if (align > worker_bcast_align)
+	worker_bcast_align = align;
+      data->offset = (data->offset + align - 1) & ~(align - 1);
+
+      data->ptr = gen_reg_rtx (Pmode);
+
+      return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
+    }
+  else if (pm & PM_loop_end)
+    {
+      rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
+      data->ptr = NULL_RTX;
+      return clobber;
+    }
+  else
+    return nvptx_gen_wcast (reg, pm, rep, data);
+}
+
+/* Spill or fill live state that is live at start of BLOCK.  PRE_P
+   indicates if this is just before partitioned mode (do spill), or
+   just after it starts (do fill).  Sequence is inserted just after
+   INSN.  */
+
+static void
+nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+{
+  wcast_data_t data;
+
+  data.base = gen_reg_rtx (Pmode);
+  data.offset = 0;
+  data.ptr = NULL_RTX;
+
+  nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+  if (data.offset)
+    {
+      /* Stuff was emitted, initialize the base pointer now.  */
+      rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
+      emit_insn_after (init, insn);
+      
+      if (worker_bcast_size < data.offset)
+	worker_bcast_size = data.offset;
+    }
+}
+
+/* Emit a worker-level synchronization barrier.  We use different
+   markers for before and after synchronizations.  */
+
+static rtx
+nvptx_wsync (bool after)
+{
+  return gen_nvptx_barsync (GEN_INT (after));
+}
+
+/* Single neutering according to MASK.  FROM is the incoming block and
+   TO is the outgoing block.  These may be the same block. Insert at
+   start of FROM:
+   
+     if (tid.<axis>) goto end.
+
+   and insert before ending branch of TO (if there is such an insn):
+
+     end:
+     <possibly-broadcast-cond>
+     <branch>
+
+   We currently only use different FROM and TO when skipping an entire
+   loop.  We could do more if we detected superblocks.  */
+
+static void
+nvptx_single (unsigned mask, basic_block from, basic_block to)
+{
+  rtx_insn *head = BB_HEAD (from);
+  rtx_insn *tail = BB_END (to);
+  unsigned skip_mask = mask;
+
+  /* Find the first insn of the FROM block.  */
+  while (head != BB_END (from) && !INSN_P (head))
+    head = NEXT_INSN (head);
+
+  /* Find the last insn of the TO block.  */
+  rtx_insn *limit = from == to ? head : BB_HEAD (to);
+  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
+    tail = PREV_INSN (tail);
+
+  /* Detect if tail is a branch.  */
+  rtx tail_branch = NULL_RTX;
+  rtx cond_branch = NULL_RTX;
+  if (tail && INSN_P (tail))
+    {
+      tail_branch = PATTERN (tail);
+      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
+	tail_branch = NULL_RTX;
+      else
+	{
+	  cond_branch = SET_SRC (tail_branch);
+	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
+	    cond_branch = NULL_RTX;
+	}
+    }
+
+  if (tail == head)
+    {
+      /* If this is empty, do nothing.  */
+      if (!head || !INSN_P (head))
+	return;
+
+      /* If this is a dummy insn, do nothing.  */
+      switch (recog_memoized (head))
+	{
+	default:
+	  break;
+	case CODE_FOR_nvptx_fork:
+	case CODE_FOR_nvptx_forked:
+	case CODE_FOR_nvptx_joining:
+	case CODE_FOR_nvptx_join:
+	  return;
+	}
+
+      if (cond_branch)
+	{
+	  /* If we're only doing vector single, there's no need to
+	     emit skip code because we'll not insert anything.  */
+	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
+	    skip_mask = 0;
+	}
+      else if (tail_branch)
+	/* Block with only unconditional branch.  Nothing to do.  */
+	return;
+    }
+
+  /* Insert the vector test inside the worker test.  */
+  unsigned mode;
+  rtx_insn *before = tail;
+  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
+    if (GOMP_DIM_MASK (mode) & skip_mask)
+      {
+	rtx_code_label *label = gen_label_rtx ();
+	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
+
+	if (!pred)
+	  {
+	    pred = gen_reg_rtx (BImode);
+	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
+	  }
+	
+	rtx br;
+	if (mode == GOMP_DIM_VECTOR)
+	  br = gen_br_true (pred, label);
+	else
+	  br = gen_br_true_uni (pred, label);
+	emit_insn_before (br, head);
+
+	LABEL_NUSES (label)++;
+	if (tail_branch)
+	  before = emit_label_before (label, before);
+	else
+	  emit_label_after (label, tail);
+      }
+
+  /* Now deal with propagating the branch condition.  */
+  if (cond_branch)
+    {
+      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
+
+      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
+	{
+	  /* Vector mode only, do a shuffle.  */
+	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
+	}
+      else
+	{
+	  /* Includes worker mode, do spill & fill.  By construction
+	     we should never have worker mode only. */
+	  wcast_data_t data;
+
+	  data.base = worker_bcast_sym;
+	  data.ptr = 0;
+
+	  if (worker_bcast_size < GET_MODE_SIZE (SImode))
+	    worker_bcast_size = GET_MODE_SIZE (SImode);
+
+	  data.offset = 0;
+	  emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
+			    before);
+	  /* Barrier so other workers can see the write.  */
+	  emit_insn_before (nvptx_wsync (false), tail);
+	  data.offset = 0;
+	  emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
+	  /* This barrier is needed to avoid worker zero clobbering
+	     the broadcast buffer before all the other workers have
+	     had a chance to read this instance of it.  */
+	  emit_insn_before (nvptx_wsync (true), tail);
+	}
+
+      extract_insn (tail);
+      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
+				 UNSPEC_BR_UNIFIED);
+      validate_change (tail, recog_data.operand_loc[0], unsp, false);
+    }
+}
+
+/* PAR is a parallel that is being skipped in its entirety according to
+   MASK.  Treat this as skipping a superblock starting at forked
+   and ending at joining.  */
+
+static void
+nvptx_skip_par (unsigned mask, parallel *par)
+{
+  basic_block tail = par->join_block;
+  gcc_assert (tail->preds->length () == 1);
+
+  basic_block pre_tail = (*tail->preds)[0]->src;
+  gcc_assert (pre_tail->succs->length () == 1);
+
+  nvptx_single (mask, par->forked_block, pre_tail);
+}
+
+/* Process the parallel PAR and all its contained
+   parallels.  We do everything but the neutering.  Return mask of
+   partitioned modes used within this parallel.  */
+
+static unsigned
+nvptx_process_pars (parallel *par)
+{
+  unsigned inner_mask = par->mask;
+
+  /* Do the inner parallels first.  */
+  if (par->inner)
+    {
+      par->inner_mask = nvptx_process_pars (par->inner);
+      inner_mask |= par->inner_mask;
+    }
+
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+    /* No propagation needed for a call.  */;
+  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+    {
+      nvptx_wpropagate (false, par->forked_block, par->forked_insn);
+      nvptx_wpropagate (true, par->forked_block, par->fork_insn);
+      /* Insert begin and end synchronizations.  */
+      emit_insn_after (nvptx_wsync (false), par->forked_insn);
+      emit_insn_before (nvptx_wsync (true), par->joining_insn);
+    }
+  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+    nvptx_vpropagate (par->forked_block, par->forked_insn);
+
+  /* Now do siblings.  */
+  if (par->next)
+    inner_mask |= nvptx_process_pars (par->next);
+  return inner_mask;
+}
+
+/* Neuter the parallel described by PAR.  We recurse in depth-first
+   order.  MODES are the partitioning of the execution and OUTER is
+   the partitioning of the parallels we are contained in.  */
+
+static void
+nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
+{
+  unsigned me = (par->mask
+		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
+  unsigned skip_mask = 0, neuter_mask = 0;
+  
+  if (par->inner)
+    nvptx_neuter_pars (par->inner, modes, outer | me);
+
+  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
+    {
+      if ((outer | me) & GOMP_DIM_MASK (mode))
+	{} /* Mode is partitioned: no neutering.  */
+      else if (!(modes & GOMP_DIM_MASK (mode)))
+	{} /* Mode is not used: nothing to do.  */  
+      else if (par->inner_mask & GOMP_DIM_MASK (mode)
+	       || !par->forked_insn)
+	/* Partitioned in inner parallels, or we're not partitioned
+	   at all: neuter individual blocks.  */
+	neuter_mask |= GOMP_DIM_MASK (mode);
+      else if (!par->parent || !par->parent->forked_insn
+	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
+	/* Parent isn't a parallel, or it already contains this
+	   partitioning: skip the parallel at this level.  */
+	skip_mask |= GOMP_DIM_MASK (mode);
+      else
+	{} /* Parent will skip this parallel itself.  */
+    }
+
+  if (neuter_mask)
+    {
+      int ix;
+      int len = par->blocks.length ();
+
+      for (ix = 0; ix != len; ix++)
+	{
+	  basic_block block = par->blocks[ix];
+
+	  nvptx_single (neuter_mask, block, block);
+	}
+    }
+
+  if (skip_mask)
+    nvptx_skip_par (skip_mask, par);
+  
+  if (par->next)
+    nvptx_neuter_pars (par->next, modes, outer);
+}
+
 /* PTX-specific reorganization
+   - Scan and release reduction buffers
+   - Split blocks at fork and join instructions
    - Compute live registers
    - Mark now-unused registers, so function begin doesn't declare
    unused registers.
+   - Insert state propagation when entering partitioned mode
+   - Insert neutering instructions when in single mode
    - Replace subregs with suitable sequences.
 */
 
@@ -1989,19 +3047,60 @@  nvptx_reorg (void)
 
   thread_prologue_and_epilogue_insns ();
 
+  /* Split blocks and record interesting unspecs.  */
+  bb_insn_map_t bb_insn_map;
+
+  nvptx_split_blocks (&bb_insn_map);
+
   /* Compute live regs */
   df_clear_flags (DF_LR_RUN_DCE);
   df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
+  df_live_add_problem ();
+  df_live_set_all_dirty ();
   df_analyze ();
   regstat_init_n_sets_and_refs ();
 
-  int max_regs = max_reg_num ();
-
+  if (dump_file)
+    df_dump (dump_file);
+  
   /* Mark unused regs as unused.  */
+  int max_regs = max_reg_num ();
   for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
     if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
       regno_reg_rtx[i] = const0_rtx;
 
+  /* Determine launch dimensions of the function.  If it is not an
+     offloaded function (i.e. this is a regular compiler), the
+     function has no neutering.  */
+  tree attr = get_oacc_fn_attrib (current_function_decl);
+  if (attr)
+    {
+      /* If we determined this mask before RTL expansion, we could
+	 elide emission of some levels of forks and joins.  */
+      unsigned mask = 0;
+      tree dims = TREE_VALUE (attr);
+      unsigned ix;
+
+      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+	{
+	  int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
+	  tree allowed = TREE_PURPOSE (dims);
+
+	  if (size != 1 && !(allowed && integer_zerop (allowed)))
+	    mask |= GOMP_DIM_MASK (ix);
+	}
+      /* If there is worker neutering, there must be vector
+	 neutering.  Otherwise the hardware will fail.  */
+      gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+		  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
+
+      /* Discover & process partitioned regions.  */
+      parallel *pars = nvptx_discover_pars (&bb_insn_map);
+      nvptx_process_pars (pars);
+      nvptx_neuter_pars (pars, mask, 0);
+      delete pars;
+    }
+
   /* Replace subregs.  */
   nvptx_reorg_subreg ();
 
@@ -2052,6 +3151,26 @@  nvptx_vector_alignment (const_tree type)
 
   return MIN (align, BIGGEST_ALIGNMENT);
 }
+
+/* Indicate that INSN cannot be duplicated.   */
+
+static bool
+nvptx_cannot_copy_insn_p (rtx_insn *insn)
+{
+  switch (recog_memoized (insn))
+    {
+    case CODE_FOR_nvptx_shufflesi:
+    case CODE_FOR_nvptx_shufflesf:
+    case CODE_FOR_nvptx_barsync:
+    case CODE_FOR_nvptx_fork:
+    case CODE_FOR_nvptx_forked:
+    case CODE_FOR_nvptx_joining:
+    case CODE_FOR_nvptx_join:
+      return true;
+    default:
+      return false;
+    }
+}
 
 /* Record a symbol for mkoffload to enter into the mapping table.  */
 
@@ -2129,6 +3248,19 @@  nvptx_file_end (void)
   FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
     nvptx_record_fndecl (decl, true);
   fputs (func_decls.str().c_str(), asm_out_file);
+
+  if (worker_bcast_size)
+    {
+      /* Define the broadcast buffer.  */
+
+      worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
+	& ~(worker_bcast_align - 1);
+      
+      fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_bcast_name);
+      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
+	       worker_bcast_align,
+	       worker_bcast_name, worker_bcast_size);
+    }
 }
 
 /* Validate compute dimensions of an OpenACC offload or routine, fill
@@ -2141,12 +3273,32 @@  nvptx_goacc_validate_dims (tree ARG_UNUS
 {
   bool changed = false;
 
-  /* TODO: Leave dimensions unaltered.  Partitioned execution needs
+  /* TODO: Leave dimensions unaltered.  Reductions need
      porting before filtering dimensions makes sense.  */
 
   return changed;
 }
-
+
+/* Determine whether fork & joins are needed.  */
+
+static bool
+nvptx_goacc_fork_join (gcall *call, const int dims[],
+		       bool ARG_UNUSED (is_fork))
+{
+  tree arg = gimple_call_arg (call, 2);
+  unsigned axis = TREE_INT_CST_LOW (arg);
+
+  /* We only care about worker and vector partitioning.  */
+  if (axis < GOMP_DIM_WORKER)
+    return false;
+
+  /* If the size is 1, there's no partitioning.  */
+  if (dims[axis] == 1)
+    return false;
+
+  return true;
+}
+
 #undef TARGET_OPTION_OVERRIDE
 #define TARGET_OPTION_OVERRIDE nvptx_option_override
 
@@ -2233,9 +3385,15 @@  nvptx_goacc_validate_dims (tree ARG_UNUS
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
 
+#undef TARGET_CANNOT_COPY_INSN_P
+#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
+
 #undef TARGET_GOACC_VALIDATE_DIMS
 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
 
+#undef TARGET_GOACC_FORK_JOIN
+#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"
Index: gcc/config/nvptx/nvptx.h
===================================================================
--- gcc/config/nvptx/nvptx.h	(revision 229472)
+++ gcc/config/nvptx/nvptx.h	(working copy)
@@ -230,6 +230,7 @@  struct GTY(()) machine_function
   HOST_WIDE_INT outgoing_stdarg_size;
   int ret_reg_mode; /* machine_mode not defined yet. */
   int punning_buffer_size;
+  rtx axis_predicate[2];
 };
 #endif
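
Finally, a hedged restatement of the decision nvptx_goacc_fork_join
makes, with the gomp-constants.h values inlined (needs_fork_join is an
invented name): the gang level never needs the PTX markers, and an
axis whose launch dimension is 1 is effectively unpartitioned.

#include <stdbool.h>

static bool
needs_fork_join (unsigned axis, const int dims[])
{
  if (axis < 1 /* GOMP_DIM_WORKER */)
    return false;             /* gang: no marker needed */
  return dims[axis] != 1;     /* size 1: not partitioned */
}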