[OpenACC,2/11] PTX backend changes

Message ID 5630DAE0.1020000@acm.org
State New

Commit Message

Nathan Sidwell Oct. 28, 2015, 2:25 p.m. UTC
This is the patch I've just committed.

It includes the new target hook overrides, which were originally in patch 3.

nathan

Patch

2015-10-28  Nathan Sidwell  <nathan@codesourcery.com>

	* config/nvptx/nvptx.h (struct machine_function): Add
	axis_predicate.
	* config/nvptx/nvptx-protos.h (nvptx_expand_oacc_fork,
	nvptx_expand_oacc_join): Declare.
	* config/nvptx/nvptx.md (UNSPEC_NTID, UNSPEC_TID): Delete.
	(UNSPEC_DIM_SIZE, UNSPEC_SHARED_DATA, UNSPEC_BIT_CONV,
	UNSPEC_SHUFFLE, UNSPEC_BR_UNIFIED): New.
	(UNSPECV_BARSYNC, UNSPECV_DIM_POS, UNSPECV_FORK, UNSPECV_FORKED,
	UNSPECV_JOINING, UNSPECV_JOIN): New.
	(BITS, BITD): New mode iterators.
	(br_true_uni, br_false_uni): New.
	(*oacc_ntid_insn, oacc_ntid, *oacc_tid_insn, oacc_tid): Delete.
	(oacc_dim_size, oacc_dim_pos): New.
	(nvptx_fork, nvptx_forked, nvptx_joining, nvptx_join): New.
	(oacc_fork, oacc_join): New.
	(nvptx_shuffle<mode>, unpack<mode>si2, packsi<mode>2): New.
	(worker_load<mode>, worker_store<mode>): New.
	(nvptx_barsync): New.
	* config/nvptx/nvptx.c: Include gimple.h & dumpfile.h.
	(SHUFFLE_UP, SHUFFLE_DOWN, SHUFFLE_BFLY, SHUFFLE_IDX): Define.
	(worker_bcast_hwm, worker_bcast_align, worker_bcast_name,
	worker_bcast_sym): New.
	(nvptx_option_override): Initialize worker broadcast buffer.
	(nvptx_emit_forking, nvptx_emit_joining): New.
	(nvptx_init_axis_predicate): New.
	(nvptx_declare_function_name): Init axis predicates.
	(nvptx_expand_call): Add fork/join markers around routine call.
	(nvptx_expand_oacc_fork, nvptx_expand_oacc_join): New.
	(nvptx_gen_unpack, nvptx_gen_pack, nvptx_gen_shuffle): New.
	(nvptx_gen_vcast): New.
	(struct wcast_data_t): New.
	(enum propagate_mask): New.
	(nvptx_gen_wcast): New.
	(nvptx_print_operand): Add 'S' case.
	(struct parallel): New.
	(parallel::parallel, parallel::~parallel): New.
	(bb_insn_map_t, insn_bb_t, insn_bb_vec_t): New typedefs.
	(nvptx_split_blocks, nvptx_discover_pre, nvptx_dump_pars,
	nvptx_find_par, nvptx_discover_pars): New.
	(nvptx_propagate): New.
	(vprop_gen, nvptx_vpropagate): New.
	(wprop_gen, nvptx_wpropagate): New.
	(nvptx_wsync): New.
	(nvptx_single, nvptx_skip_par): New.
	(nvptx_process_pars, nvptx_neuter_pars): New.
	(nvptx_reorg): Split blocks, generate parallel structure, apply
	neutering.
	(nvptx_cannot_copy_insn_p): New.
	(nvptx_file_end): Emit worker broadcast decl.
	(nvptx_goacc_fork_join): New.
	(TARGET_CANNOT_COPY_INSN_P): Override.
	(TARGET_GOACC_FORK_JOIN): Override.

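A note on the overall scheme (an editorial illustration, not part of the
patch): partitioned regions are bracketed by fork/join marker unspecs,
and code that must run in "single" mode is neutered so that only thread
zero of the relevant axis computes, with the result then broadcast to
the idle threads.  A minimal standalone C model of that idea follows;
all names in it (axis_id, bcast_buffer, single_block) are invented for
the sketch:

#include <stdio.h>

static int axis_id;       /* stands in for %tid.y or %tid.x */
static int bcast_buffer;  /* stands in for the shared broadcast buffer */

static int
single_block (int live_in)
{
  if (axis_id == 0)             /* the inserted skip branch */
    bcast_buffer = live_in * 2; /* only thread zero computes: spill */
  /* a bar.sync separates the write from the reads here */
  return bcast_buffer;          /* fill: every thread reads the result */
}

int
main (void)
{
  for (axis_id = 0; axis_id < 4; axis_id++)
    printf ("thread %d sees %d\n", axis_id, single_block (21));
  return 0;
}

In the patch itself the skip is a br_true/br_true_uni branch on a
per-function axis predicate, and the buffer is the __worker_bcast array
emitted at file end.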
Index: gcc/config/nvptx/nvptx-protos.h
===================================================================
--- gcc/config/nvptx/nvptx-protos.h	(revision 229472)
+++ gcc/config/nvptx/nvptx-protos.h	(working copy)
@@ -32,6 +32,8 @@  extern void nvptx_register_pragmas (void
 extern const char *nvptx_section_for_decl (const_tree);
 
 #ifdef RTX_CODE
+extern void nvptx_expand_oacc_fork (unsigned);
+extern void nvptx_expand_oacc_join (unsigned);
 extern void nvptx_expand_call (rtx, rtx);
 extern rtx nvptx_expand_compare (rtx);
 extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
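
For orientation: the unsigned argument to these new entry points is a
GOMP_DIM_* level.  The relevant constants, reproduced here from
include/gomp-constants.h (abridged), are turned into single-bit masks
by the expanders:

#define GOMP_DIM_GANG    0
#define GOMP_DIM_WORKER  1
#define GOMP_DIM_VECTOR  2
#define GOMP_DIM_MAX     3
#define GOMP_DIM_MASK(X) (1u << (X))

So nvptx_expand_oacc_fork (GOMP_DIM_WORKER) emits the fork/forked
markers with mask GOMP_DIM_MASK (GOMP_DIM_WORKER).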
Index: gcc/config/nvptx/nvptx.md
===================================================================
--- gcc/config/nvptx/nvptx.md	(revision 229472)
+++ gcc/config/nvptx/nvptx.md	(working copy)
@@ -49,14 +49,27 @@ 
 
    UNSPEC_ALLOCA
 
-   UNSPEC_NTID
-   UNSPEC_TID
+   UNSPEC_DIM_SIZE
+
+   UNSPEC_SHARED_DATA
+
+   UNSPEC_BIT_CONV
+
+   UNSPEC_SHUFFLE
+   UNSPEC_BR_UNIFIED
 ])
 
 (define_c_enum "unspecv" [
    UNSPECV_LOCK
    UNSPECV_CAS
    UNSPECV_XCHG
+   UNSPECV_BARSYNC
+   UNSPECV_DIM_POS
+
+   UNSPECV_FORK
+   UNSPECV_FORKED
+   UNSPECV_JOINING
+   UNSPECV_JOIN
 ])
 
 (define_attr "subregs_ok" "false,true"
@@ -246,6 +259,8 @@ 
 (define_mode_iterator QHSIM [QI HI SI])
 (define_mode_iterator SDFM [SF DF])
 (define_mode_iterator SDCM [SC DC])
+(define_mode_iterator BITS [SI SF])
+(define_mode_iterator BITD [DI DF])
 
 ;; This mode iterator allows :P to be used for patterns that operate on
 ;; pointer-sized quantities.  Exactly one of the two alternatives will match.
@@ -817,6 +832,23 @@ 
   ""
   "%J0\\tbra\\t%l1;")
 
+;; unified conditional branch
+(define_insn "br_true_uni"
+  [(set (pc) (if_then_else
+	(ne (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
+		       UNSPEC_BR_UNIFIED) (const_int 0))
+        (label_ref (match_operand 1 "" "")) (pc)))]
+  ""
+  "%j0\\tbra.uni\\t%l1;")
+
+(define_insn "br_false_uni"
+  [(set (pc) (if_then_else
+	(eq (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
+		       UNSPEC_BR_UNIFIED) (const_int 0))
+        (label_ref (match_operand 1 "" "")) (pc)))]
+  ""
+  "%J0\\tbra.uni\\t%l1;")
+
 (define_expand "cbranch<mode>4"
   [(set (pc)
 	(if_then_else (match_operator 0 "nvptx_comparison_operator"
@@ -1308,36 +1340,134 @@ 
   DONE;
 })
 
-(define_insn "*oacc_ntid_insn"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_NTID))]
+(define_insn "oacc_dim_size"
+  [(set (match_operand:SI 0 "nvptx_register_operand" "")
+	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")]
+		   UNSPEC_DIM_SIZE))]
   ""
-  "%.\\tmov.u32 %0, %%ntid%d1;")
+{
+  static const char *const asms[] =
+{ /* Must match oacc_loop_levels ordering.  */
+  "%.\\tmov.u32\\t%0, %%nctaid.x;",	/* gang */
+  "%.\\tmov.u32\\t%0, %%ntid.y;",	/* worker */
+  "%.\\tmov.u32\\t%0, %%ntid.x;",	/* vector */
+};
+  return asms[INTVAL (operands[1])];
+})
 
-(define_expand "oacc_ntid"
+(define_insn "oacc_dim_pos"
   [(set (match_operand:SI 0 "nvptx_register_operand" "")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_NTID))]
+	(unspec_volatile:SI [(match_operand:SI 1 "const_int_operand" "")]
+			    UNSPECV_DIM_POS))]
   ""
 {
-  if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
-    FAIL;
+  static const char *const asms[] =
+{ /* Must match oacc_loop_levels ordering.  */
+  "%.\\tmov.u32\\t%0, %%ctaid.x;",	/* gang */
+  "%.\\tmov.u32\\t%0, %%tid.y;",	/* worker */
+  "%.\\tmov.u32\\t%0, %%tid.x;",	/* vector */
+};
+  return asms[INTVAL (operands[1])];
 })
 
-(define_insn "*oacc_tid_insn"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_TID))]
+(define_insn "nvptx_fork"
+  [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+		       UNSPECV_FORK)]
   ""
-  "%.\\tmov.u32 %0, %%tid%d1;")
+  "// fork %0;"
+)
 
-(define_expand "oacc_tid"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_TID))]
+(define_insn "nvptx_forked"
+  [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+		       UNSPECV_FORKED)]
+  ""
+  "// forked %0;"
+)
+
+(define_insn "nvptx_joining"
+  [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+		       UNSPECV_JOINING)]
+  ""
+  "// joining %0;"
+)
+
+(define_insn "nvptx_join"
+  [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+		       UNSPECV_JOIN)]
+  ""
+  "// join %0;"
+)
+
+(define_expand "oacc_fork"
+  [(set (match_operand:SI 0 "nvptx_nonmemory_operand" "")
+        (match_operand:SI 1 "nvptx_general_operand" ""))
+   (unspec_volatile:SI [(match_operand:SI 2 "const_int_operand" "")]
+		        UNSPECV_FORKED)]
   ""
 {
-  if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
-    FAIL;
+  if (operands[0] != const0_rtx)
+    emit_move_insn (operands[0], operands[1]);
+  nvptx_expand_oacc_fork (INTVAL (operands[2]));
+  DONE;
 })
 
+(define_expand "oacc_join"
+  [(set (match_operand:SI 0 "nvptx_nonmemory_operand" "")
+        (match_operand:SI 1 "nvptx_general_operand" ""))
+   (unspec_volatile:SI [(match_operand:SI 2 "const_int_operand" "")]
+		        UNSPECV_JOIN)]
+  ""
+{
+  if (operands[0] != const0_rtx)
+    emit_move_insn (operands[0], operands[1]);
+  nvptx_expand_oacc_join (INTVAL (operands[2]));
+  DONE;
+})
+
+;; only 32-bit shuffles exist.
+(define_insn "nvptx_shuffle<mode>"
+  [(set (match_operand:BITS 0 "nvptx_register_operand" "=R")
+	(unspec:BITS
+		[(match_operand:BITS 1 "nvptx_register_operand" "R")
+		 (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")
+		 (match_operand:SI 3 "const_int_operand" "n")]
+		  UNSPEC_SHUFFLE))]
+  ""
+  "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;")
+
+;; extract parts of a 64 bit object into 2 32-bit ints
+(define_insn "unpack<mode>si2"
+  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+        (unspec:SI [(match_operand:BITD 2 "nvptx_register_operand" "R")
+		    (const_int 0)] UNSPEC_BIT_CONV))
+   (set (match_operand:SI 1 "nvptx_register_operand" "=R")
+        (unspec:SI [(match_dup 2) (const_int 1)] UNSPEC_BIT_CONV))]
+  ""
+  "%.\\tmov.b64\\t{%0,%1}, %2;")
+
+;; pack 2 32-bit ints into a 64 bit object
+(define_insn "packsi<mode>2"
+  [(set (match_operand:BITD 0 "nvptx_register_operand" "=R")
+        (unspec:BITD [(match_operand:SI 1 "nvptx_register_operand" "R")
+		      (match_operand:SI 2 "nvptx_register_operand" "R")]
+		    UNSPEC_BIT_CONV))]
+  ""
+  "%.\\tmov.b64\\t%0, {%1,%2};")
+
+(define_insn "worker_load<mode>"
+  [(set (match_operand:SDISDFM 0 "nvptx_register_operand" "=R")
+        (unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "m")]
+			 UNSPEC_SHARED_DATA))]
+  ""
+  "%.\\tld.shared%u0\\t%0, %1;")
+
+(define_insn "worker_store<mode>"
+  [(set (unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "=m")]
+			 UNSPEC_SHARED_DATA)
+	(match_operand:SDISDFM 0 "nvptx_register_operand" "R"))]
+  ""
+  "%.\\tst.shared%u1\\t%1, %0;")
+
 ;; Atomic insns.
 
 (define_expand "atomic_compare_and_swap<mode>"
@@ -1423,3 +1553,9 @@ 
 	(match_dup 1))]
   "0"
   "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;")
+
+(define_insn "nvptx_barsync"
+  [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+		    UNSPECV_BARSYNC)]
+  ""
+  "\\tbar.sync\\t%0;")
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 229472)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -51,14 +51,21 @@ 
 #include "langhooks.h"
 #include "dbxout.h"
 #include "cfgrtl.h"
+#include "gimple.h"
 #include "stor-layout.h"
 #include "builtins.h"
 #include "omp-low.h"
 #include "gomp-constants.h"
+#include "dumpfile.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
 
+#define SHUFFLE_UP 0
+#define SHUFFLE_DOWN 1
+#define SHUFFLE_BFLY 2
+#define SHUFFLE_IDX 3
+
 /* Record the function decls we've written, and the libfuncs and function
    decls corresponding to them.  */
 static std::stringstream func_decls;
@@ -81,6 +88,16 @@  struct tree_hasher : ggc_cache_ptr_hash<
 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
 
+/* Size of buffer needed to broadcast across workers.  This is used
+   for both worker-neutering and worker broadcasting.  It is shared
+   by all functions emitted.  The buffer is placed in shared memory.
+   It'd be nice if PTX supported common blocks, because then this
+   could be shared across TUs (taking the largest size).  */
+static unsigned worker_bcast_size;
+static unsigned worker_bcast_align;
+#define worker_bcast_name "__worker_bcast"
+static GTY(()) rtx worker_bcast_sym;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -108,6 +125,9 @@  nvptx_option_override (void)
   needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
   declared_libfuncs_htab
     = hash_table<declared_libfunc_hasher>::create_ggc (17);
+
+  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
+  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
 }
 
 /* Return the mode to be used when declaring a ptx object for OBJ.
@@ -194,6 +214,47 @@  nvptx_split_reg_p (machine_mode mode)
   return false;
 }
 
+/* Emit forking instructions for MASK.  */
+
+static void
+nvptx_emit_forking (unsigned mask, bool is_call)
+{
+  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+  if (mask)
+    {
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
+      
+      /* Emit fork at all levels.  This helps form SESE regions, as
+	 it creates a block with a single successor before entering a
+	 partitioned region.  That is a good candidate for the end of
+	 an SESE region.  */
+      if (!is_call)
+	emit_insn (gen_nvptx_fork (op));
+      emit_insn (gen_nvptx_forked (op));
+    }
+}
+
+/* Emit joining instructions for MASK.  */
+
+static void
+nvptx_emit_joining (unsigned mask, bool is_call)
+{
+  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+  if (mask)
+    {
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
+
+      /* Emit joining for all non-call pars to ensure there's a single
+	 predecessor for the block the join insn ends up in.  This is
+	 needed for skipping entire loops.  */
+      if (!is_call)
+	emit_insn (gen_nvptx_joining (op));
+      emit_insn (gen_nvptx_join (op));
+    }
+}
+
 #define PASS_IN_REG_P(MODE, TYPE)				\
   ((GET_MODE_CLASS (MODE) == MODE_INT				\
     || GET_MODE_CLASS (MODE) == MODE_FLOAT			\
@@ -500,6 +561,19 @@  nvptx_record_needed_fndecl (tree decl)
     *slot = decl;
 }
 
+/* Emit code to initialize the REGNO predicate register to indicate
+   whether we are not lane zero on the NAME axis.  */
+
+static void
+nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
+{
+  fprintf (file, "\t{\n");
+  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
+  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
+  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
+  fprintf (file, "\t}\n");
+}
+
 /* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
    function, including local var decls and copies from the arguments to
    local regs.  */
@@ -623,6 +697,14 @@  nvptx_declare_function_name (FILE *file,
   if (stdarg_p (fntype))
     fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
 	     GET_MODE_BITSIZE (Pmode));
+
+  /* Emit axis predicates.  */
+  if (cfun->machine->axis_predicate[0])
+    nvptx_init_axis_predicate (file,
+			       REGNO (cfun->machine->axis_predicate[0]), "y");
+  if (cfun->machine->axis_predicate[1])
+    nvptx_init_axis_predicate (file,
+			       REGNO (cfun->machine->axis_predicate[1]), "x");
 }
 
 /* Output a return instruction.  Also copy the return value to its outgoing
@@ -779,6 +861,7 @@  nvptx_expand_call (rtx retval, rtx addre
   bool external_decl = false;
   rtx varargs = NULL_RTX;
   tree decl_type = NULL_TREE;
+  unsigned parallel = 0;
 
   for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
     nargs++;
@@ -799,6 +882,22 @@  nvptx_expand_call (rtx retval, rtx addre
 	    cfun->machine->has_call_with_sc = true;
 	  if (DECL_EXTERNAL (decl))
 	    external_decl = true;
+	  tree attr = get_oacc_fn_attrib (decl);
+	  if (attr)
+	    {
+	      tree dims = TREE_VALUE (attr);
+
+	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
+	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
+		{
+		  if (TREE_PURPOSE (dims)
+		      && !integer_zerop (TREE_PURPOSE (dims)))
+		    break;
+		  /* Not on this axis.  */
+		  parallel ^= GOMP_DIM_MASK (ix);
+		  dims = TREE_CHAIN (dims);
+		}
+	    }
 	}
     }
 
@@ -860,7 +959,11 @@  nvptx_expand_call (rtx retval, rtx addre
 	  write_func_decl_from_insn (func_decls, retval, pat, callee);
 	}
     }
+
+  nvptx_emit_forking (parallel, true);
   emit_call_insn (pat);
+  nvptx_emit_joining (parallel, true);
+
   if (tmp_retval != retval)
     emit_move_insn (retval, tmp_retval);
 }
@@ -1069,6 +1172,214 @@  nvptx_expand_compare (rtx compare)
   return gen_rtx_NE (BImode, pred, const0_rtx);
 }
 
+/* Expand the oacc fork & join primitive into ptx-required unspecs.  */
+
+void
+nvptx_expand_oacc_fork (unsigned mode)
+{
+  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
+}
+
+void
+nvptx_expand_oacc_join (unsigned mode)
+{
+  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
+}
+
+/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
+   objects.  */
+
+static rtx
+nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
+{
+  rtx res;
+  
+  switch (GET_MODE (src))
+    {
+    case DImode:
+      res = gen_unpackdisi2 (dst0, dst1, src);
+      break;
+    case DFmode:
+      res = gen_unpackdfsi2 (dst0, dst1, src);
+      break;
+    default: gcc_unreachable ();
+    }
+  return res;
+}
+
+/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
+   object.  */
+
+static rtx
+nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
+{
+  rtx res;
+  
+  switch (GET_MODE (dst))
+    {
+    case DImode:
+      res = gen_packsidi2 (dst, src0, src1);
+      break;
+    case DFmode:
+      res = gen_packsidf2 (dst, src0, src1);
+      break;
+    default: gcc_unreachable ();
+    }
+  return res;
+}
+
+/* Generate an instruction or sequence performing a warp shuffle of
+   SRC into DST, using lane operand IDX and shuffle kind KIND.  */
+
+static rtx
+nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
+{
+  rtx res;
+
+  switch (GET_MODE (dst))
+    {
+    case SImode:
+      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
+      break;
+    case SFmode:
+      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
+      break;
+    case DImode:
+    case DFmode:
+      {
+	rtx tmp0 = gen_reg_rtx (SImode);
+	rtx tmp1 = gen_reg_rtx (SImode);
+
+	start_sequence ();
+	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
+	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
+	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
+	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
+	res = get_insns ();
+	end_sequence ();
+      }
+      break;
+    case BImode:
+      {
+	rtx tmp = gen_reg_rtx (SImode);
+	
+	start_sequence ();
+	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
+	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
+	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
+	res = get_insns ();
+	end_sequence ();
+      }
+      break;
+      
+    default:
+      gcc_unreachable ();
+    }
+  return res;
+}
+
+/* Generate an instruction or sequence to broadcast register REG
+   across the vectors of a single warp.  */
+
+static rtx
+nvptx_gen_vcast (rtx reg)
+{
+  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
+}
+
+/* Structure used when generating a worker-level spill or fill.  */
+
+struct wcast_data_t
+{
+  rtx base;  /* Register holding base addr of buffer.  */
+  rtx ptr;  /* Iteration var, if needed.  */
+  unsigned offset; /* Offset into worker buffer.  */
+};
+
+/* Direction of the spill/fill and looping setup/teardown indicator.  */
+
+enum propagate_mask
+  {
+    PM_read = 1 << 0,
+    PM_write = 1 << 1,
+    PM_loop_begin = 1 << 2,
+    PM_loop_end = 1 << 3,
+
+    PM_read_write = PM_read | PM_write
+  };
+
+/* Generate instruction(s) to spill or fill register REG to/from the
+   worker broadcast array.  PM indicates what is to be done, REP
+   how many loop iterations will be executed (0 for not a loop).  */
+   
+static rtx
+nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
+{
+  rtx  res;
+  machine_mode mode = GET_MODE (reg);
+
+  switch (mode)
+    {
+    case BImode:
+      {
+	rtx tmp = gen_reg_rtx (SImode);
+	
+	start_sequence ();
+	if (pm & PM_read)
+	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
+	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
+	if (pm & PM_write)
+	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
+	res = get_insns ();
+	end_sequence ();
+      }
+      break;
+
+    default:
+      {
+	rtx addr = data->ptr;
+
+	if (!addr)
+	  {
+	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
+
+	    if (align > worker_bcast_align)
+	      worker_bcast_align = align;
+	    data->offset = (data->offset + align - 1) & ~(align - 1);
+	    addr = data->base;
+	    if (data->offset)
+	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
+	  }
+	
+	addr = gen_rtx_MEM (mode, addr);
+	addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
+	if (pm == PM_read)
+	  res = gen_rtx_SET (addr, reg);
+	else if (pm == PM_write)
+	  res = gen_rtx_SET (reg, addr);
+	else
+	  gcc_unreachable ();
+
+	if (data->ptr)
+	  {
+	    /* We're using a ptr, increment it.  */
+	    start_sequence ();
+	    
+	    emit_insn (res);
+	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
+				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
+	    res = get_insns ();
+	    end_sequence ();
+	  }
+	else
+	  rep = 1;
+	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
+      }
+      break;
+    }
+  return res;
+}
+
 /* When loading an operand ORIG_OP, verify whether an address space
    conversion to generic is required, and if so, perform it.  Also
    check for SYMBOL_REFs for function decls and call
@@ -1660,6 +1971,7 @@  nvptx_print_operand_address (FILE *file,
    c -- print an opcode suffix for a comparison operator, including a type code
    d -- print a CONST_INT as a vector dimension (x, y, or z)
    f -- print a full reg even for something that must always be split
+   S -- print a shuffle kind specified by CONST_INT
    t -- print a type opcode suffix, promoting QImode to 32 bits
    T -- print a type size in bits
    u -- print a type opcode suffix without promotions.  */
@@ -1723,6 +2035,15 @@  nvptx_print_operand (FILE *file, rtx x,
       fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
       break;
 
+    case 'S':
+      {
+	unsigned kind = UINTVAL (x);
+	static const char *const kinds[] = 
+	  {"up", "down", "bfly", "idx"};
+	fprintf (file, ".%s", kinds[kind]);
+      }
+      break;
+
     case 'T':
       fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
       break;
@@ -1973,10 +2294,747 @@  nvptx_reorg_subreg (void)
     }
 }
 
+/* Loop structure of the function. The entire function is described as
+   a NULL loop.  We should be able to extend this to represent
+   superblocks.  */
+
+struct parallel
+{
+  /* Parent parallel.  */
+  parallel *parent;
+  
+  /* Next sibling parallel.  */
+  parallel *next;
+
+  /* First child parallel.  */
+  parallel *inner;
+
+  /* Partitioning mask of the parallel.  */
+  unsigned mask;
+
+  /* Partitioning used within inner parallels. */
+  unsigned inner_mask;
+
+  /* Location of the parallel's forked and join markers.  The forked
+     is the first block in the parallel and the join is the first
+     block after the partition.  */
+  basic_block forked_block;
+  basic_block join_block;
+
+  rtx_insn *forked_insn;
+  rtx_insn *join_insn;
+
+  rtx_insn *fork_insn;
+  rtx_insn *joining_insn;
+
+  /* Basic blocks in this parallel, but not in child parallels.  The
+     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
+     blocks are not.  */
+  auto_vec<basic_block> blocks;
+
+public:
+  parallel (parallel *parent, unsigned mode);
+  ~parallel ();
+};
+
+/* Constructor links the new parallel into its parent's chain of
+   children.  */
+
+parallel::parallel (parallel *parent_, unsigned mask_)
+  :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
+{
+  forked_block = join_block = 0;
+  forked_insn = join_insn = 0;
+  fork_insn = joining_insn = 0;
+  
+  if (parent)
+    {
+      next = parent->inner;
+      parent->inner = this;
+    }
+}
+
+parallel::~parallel ()
+{
+  delete inner;
+  delete next;
+}
+
+/* Map of basic blocks to insns.  */
+typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
+
+/* A tuple of an insn of interest and the BB in which it resides.  */
+typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
+typedef auto_vec<insn_bb_t> insn_bb_vec_t;
+
+/* Split basic blocks such that each forked and join unspec is at
+   the start of its basic block.  Thus afterwards each block will
+   have a single partitioning mode.  We also do the same for return
+   insns, as they are executed by every thread.  Populate MAP with
+   head and tail blocks.  We also clear the BB visited flag, which is
+   used when finding partitions.  */
+
+static void
+nvptx_split_blocks (bb_insn_map_t *map)
+{
+  insn_bb_vec_t worklist;
+  basic_block block;
+  rtx_insn *insn;
+
+  /* Locate all the reorg instructions of interest.  */
+  FOR_ALL_BB_FN (block, cfun)
+    {
+      bool seen_insn = false;
+
+      /* Clear visited flag, for use by the parallel locator.  */
+      block->flags &= ~BB_VISITED;
+
+      FOR_BB_INSNS (block, insn)
+	{
+	  if (!INSN_P (insn))
+	    continue;
+	  switch (recog_memoized (insn))
+	    {
+	    default:
+	      seen_insn = true;
+	      continue;
+	    case CODE_FOR_nvptx_forked:
+	    case CODE_FOR_nvptx_join:
+	      break;
+
+	    case CODE_FOR_return:
+	      /* We also need to split just before return insns, as
+		 that insn needs executing by all threads, but the
+		 block it is in probably does not.  */
+	      break;
+	    }
+
+	  if (seen_insn)
+	    /* We've found an instruction that must be at the start of
+	       a block, but isn't.  Add it to the worklist.  */
+	    worklist.safe_push (insn_bb_t (insn, block));
+	  else
+	    /* It was already the first instruction.  Just add it to
+	       the map.  */
+	    map->get_or_insert (block) = insn;
+	  seen_insn = true;
+	}
+    }
+
+  /* Split blocks on the worklist.  */
+  unsigned ix;
+  insn_bb_t *elt;
+  basic_block remap = 0;
+  for (ix = 0; worklist.iterate (ix, &elt); ix++)
+    {
+      if (remap != elt->second)
+	{
+	  block = elt->second;
+	  remap = block;
+	}
+      
+      /* Split block before insn.  The insn is in the new block.  */
+      edge e = split_block (block, PREV_INSN (elt->first));
+
+      block = e->dest;
+      map->get_or_insert (block) = elt->first;
+    }
+}
+
+/* BLOCK is a basic block containing a head or tail instruction.
+   Locate the associated prehead or pretail instruction, which must be
+   in the single predecessor block.  */
+
+static rtx_insn *
+nvptx_discover_pre (basic_block block, int expected)
+{
+  gcc_assert (block->preds->length () == 1);
+  basic_block pre_block = (*block->preds)[0]->src;
+  rtx_insn *pre_insn;
+
+  for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
+       pre_insn = PREV_INSN (pre_insn))
+    gcc_assert (pre_insn != BB_HEAD (pre_block));
+
+  gcc_assert (recog_memoized (pre_insn) == expected);
+  return pre_insn;
+}
+
+/* Dump this parallel and all its inner parallels.  */
+
+static void
+nvptx_dump_pars (parallel *par, unsigned depth)
+{
+  fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
+	   depth, par->mask,
+	   par->forked_block ? par->forked_block->index : -1,
+	   par->join_block ? par->join_block->index : -1);
+
+  fprintf (dump_file, "    blocks:");
+
+  basic_block block;
+  for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
+    fprintf (dump_file, " %d", block->index);
+  fprintf (dump_file, "\n");
+  if (par->inner)
+    nvptx_dump_pars (par->inner, depth + 1);
+
+  if (par->next)
+    nvptx_dump_pars (par->next, depth);
+}
+
+/* If BLOCK contains a fork/join marker, process it to create or
+   terminate a loop structure.  Add this block to the current loop,
+   and then walk successor blocks.  */
+
+static parallel *
+nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
+{
+  if (block->flags & BB_VISITED)
+    return par;
+  block->flags |= BB_VISITED;
+
+  if (rtx_insn **endp = map->get (block))
+    {
+      rtx_insn *end = *endp;
+
+      /* This is a block head or tail, or return instruction.  */
+      switch (recog_memoized (end))
+	{
+	case CODE_FOR_return:
+	  /* Return instructions are in their own block, and we
+	     don't need to do anything more.  */
+	  return par;
+
+	case CODE_FOR_nvptx_forked:
+	  /* Loop head, create a new inner loop and add it into
+	     our parent's child list.  */
+	  {
+	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+
+	    gcc_assert (mask);
+	    par = new parallel (par, mask);
+	    par->forked_block = block;
+	    par->forked_insn = end;
+	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	      par->fork_insn
+		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
+	  }
+	  break;
+
+	case CODE_FOR_nvptx_join:
+	  /* A loop tail.  Finish the current loop and return to
+	     parent.  */
+	  {
+	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+
+	    gcc_assert (par->mask == mask);
+	    par->join_block = block;
+	    par->join_insn = end;
+	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	      par->joining_insn
+		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
+	    par = par->parent;
+	  }
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+
+  if (par)
+    /* Add this block onto the current loop's list of blocks.  */
+    par->blocks.safe_push (block);
+  else
+    /* This must be the entry block.  Create a NULL parallel.  */
+    par = new parallel (0, 0);
+
+  /* Walk successor blocks.  */
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, block->succs)
+    nvptx_find_par (map, par, e->dest);
+
+  return par;
+}
+
+/* DFS walk the CFG looking for fork & join markers.  Construct
+   loop structures as we go.  MAP is a mapping of basic blocks
+   to head & tail markers, discovered when splitting blocks.  This
+   speeds up the discovery.  We rely on the BB visited flag having
+   been cleared when splitting blocks.  */
+
+static parallel *
+nvptx_discover_pars (bb_insn_map_t *map)
+{
+  basic_block block;
+
+  /* Mark exit blocks as visited.  */
+  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+  block->flags |= BB_VISITED;
+
+  /* And entry block as not.  */
+  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+  block->flags &= ~BB_VISITED;
+
+  parallel *par = nvptx_find_par (map, 0, block);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "\nLoops\n");
+      nvptx_dump_pars (par, 0);
+      fprintf (dump_file, "\n");
+    }
+  
+  return par;
+}
+
+/* Propagate live state at the start of a partitioned region.  BLOCK
+   provides the live register information, and might not contain
+   INSN. Propagation is inserted just after INSN. RW indicates whether
+   we are reading and/or writing state.  This
+   separation is needed for worker-level proppagation where we
+   essentially do a spill & fill.  FN is the underlying worker
+   function to generate the propagation instructions for single
+   register.  DATA is user data.
+
+   We propagate the live register set and the entire frame.  We could
+   do better by (a) propagating just the live set that is used within
+   the partitioned regions and (b) only propagating stack entries that
+   are used.  The latter might be quite hard to determine.  */
+
+typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
+
+static void
+nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
+		 propagator_fn fn, void *data)
+{
+  bitmap live = DF_LIVE_IN (block);
+  bitmap_iterator iterator;
+  unsigned ix;
+
+  /* Copy the frame array.  */
+  HOST_WIDE_INT fs = get_frame_size ();
+  if (fs)
+    {
+      rtx tmp = gen_reg_rtx (DImode);
+      rtx idx = NULL_RTX;
+      rtx ptr = gen_reg_rtx (Pmode);
+      rtx pred = NULL_RTX;
+      rtx_code_label *label = NULL;
+
+      gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
+      fs /= GET_MODE_SIZE (DImode);
+      /* Detect single iteration loop.  */
+      if (fs == 1)
+	fs = 0;
+
+      start_sequence ();
+      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
+      if (fs)
+	{
+	  idx = gen_reg_rtx (SImode);
+	  pred = gen_reg_rtx (BImode);
+	  label = gen_label_rtx ();
+	  
+	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
+	  /* Allow worker function to initialize anything needed.  */
+	  rtx init = fn (tmp, PM_loop_begin, fs, data);
+	  if (init)
+	    emit_insn (init);
+	  emit_label (label);
+	  LABEL_NUSES (label)++;
+	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
+	}
+      if (rw & PM_read)
+	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
+      emit_insn (fn (tmp, rw, fs, data));
+      if (rw & PM_write)
+	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
+      if (fs)
+	{
+	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
+	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
+	  emit_insn (gen_br_true_uni (pred, label));
+	  rtx fini = fn (tmp, PM_loop_end, fs, data);
+	  if (fini)
+	    emit_insn (fini);
+	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
+	}
+      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
+      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
+      rtx cpy = get_insns ();
+      end_sequence ();
+      insn = emit_insn_after (cpy, insn);
+    }
+
+  /* Copy live registers.  */
+  EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+    {
+      rtx reg = regno_reg_rtx[ix];
+
+      if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+	{
+	  rtx bcast = fn (reg, rw, 0, data);
+
+	  insn = emit_insn_after (bcast, insn);
+	}
+    }
+}
+
+/* Worker for nvptx_vpropagate.  */
+
+static rtx
+vprop_gen (rtx reg, propagate_mask pm,
+	   unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
+{
+  if (!(pm & PM_read_write))
+    return 0;
+  
+  return nvptx_gen_vcast (reg);
+}
+
+/* Propagate state that is live at start of BLOCK across the vectors
+   of a single warp.  Propagation is inserted just after INSN.   */
+
+static void
+nvptx_vpropagate (basic_block block, rtx_insn *insn)
+{
+  nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+}
+
+/* Worker for nvptx_wpropagate.  */
+
+static rtx
+wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
+{
+  wcast_data_t *data = (wcast_data_t *)data_;
+
+  if (pm & PM_loop_begin)
+    {
+      /* Starting a loop, initialize pointer.  */
+      unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
+
+      if (align > worker_bcast_align)
+	worker_bcast_align = align;
+      data->offset = (data->offset + align - 1) & ~(align - 1);
+
+      data->ptr = gen_reg_rtx (Pmode);
+
+      return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
+    }
+  else if (pm & PM_loop_end)
+    {
+      rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
+      data->ptr = NULL_RTX;
+      return clobber;
+    }
+  else
+    return nvptx_gen_wcast (reg, pm, rep, data);
+}
+
+/* Spill or fill live state that is live at start of BLOCK.  PRE_P
+   indicates if this is just before partitioned mode (do spill), or
+   just after it starts (do fill).  Sequence is inserted just after
+   INSN.  */
+
+static void
+nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+{
+  wcast_data_t data;
+
+  data.base = gen_reg_rtx (Pmode);
+  data.offset = 0;
+  data.ptr = NULL_RTX;
+
+  nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+  if (data.offset)
+    {
+      /* Stuff was emitted, initialize the base pointer now.  */
+      rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
+      emit_insn_after (init, insn);
+      
+      if (worker_bcast_size < data.offset)
+	worker_bcast_size = data.offset;
+    }
+}
+
+/* Emit a worker-level synchronization barrier.  We use different
+   markers for before and after synchronizations.  */
+
+static rtx
+nvptx_wsync (bool after)
+{
+  return gen_nvptx_barsync (GEN_INT (after));
+}
+
+/* Single neutering according to MASK.  FROM is the incoming block and
+   TO is the outgoing block.  These may be the same block. Insert at
+   start of FROM:
+   
+     if (tid.<axis>) goto end.
+
+   and insert before ending branch of TO (if there is such an insn):
+
+     end:
+     <possibly-broadcast-cond>
+     <branch>
+
+   We currently only use different FROM and TO when skipping an entire
+   loop.  We could do more if we detected superblocks.  */
+
+static void
+nvptx_single (unsigned mask, basic_block from, basic_block to)
+{
+  rtx_insn *head = BB_HEAD (from);
+  rtx_insn *tail = BB_END (to);
+  unsigned skip_mask = mask;
+
+  /* Find the first insn of the FROM block.  */
+  while (head != BB_END (from) && !INSN_P (head))
+    head = NEXT_INSN (head);
+
+  /* Find the last insn of the TO block.  */
+  rtx_insn *limit = from == to ? head : BB_HEAD (to);
+  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
+    tail = PREV_INSN (tail);
+
+  /* Detect if tail is a branch.  */
+  rtx tail_branch = NULL_RTX;
+  rtx cond_branch = NULL_RTX;
+  if (tail && INSN_P (tail))
+    {
+      tail_branch = PATTERN (tail);
+      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
+	tail_branch = NULL_RTX;
+      else
+	{
+	  cond_branch = SET_SRC (tail_branch);
+	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
+	    cond_branch = NULL_RTX;
+	}
+    }
+
+  if (tail == head)
+    {
+      /* If this is empty, do nothing.  */
+      if (!head || !INSN_P (head))
+	return;
+
+      /* If this is a dummy insn, do nothing.  */
+      switch (recog_memoized (head))
+	{
+	default:
+	  break;
+	case CODE_FOR_nvptx_fork:
+	case CODE_FOR_nvptx_forked:
+	case CODE_FOR_nvptx_joining:
+	case CODE_FOR_nvptx_join:
+	  return;
+	}
+
+      if (cond_branch)
+	{
+	  /* If we're only doing vector single, there's no need to
+	     emit skip code because we'll not insert anything.  */
+	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
+	    skip_mask = 0;
+	}
+      else if (tail_branch)
+	/* Block with only unconditional branch.  Nothing to do.  */
+	return;
+    }
+
+  /* Insert the vector test inside the worker test.  */
+  unsigned mode;
+  rtx_insn *before = tail;
+  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
+    if (GOMP_DIM_MASK (mode) & skip_mask)
+      {
+	rtx_code_label *label = gen_label_rtx ();
+	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
+
+	if (!pred)
+	  {
+	    pred = gen_reg_rtx (BImode);
+	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
+	  }
+	
+	rtx br;
+	if (mode == GOMP_DIM_VECTOR)
+	  br = gen_br_true (pred, label);
+	else
+	  br = gen_br_true_uni (pred, label);
+	emit_insn_before (br, head);
+
+	LABEL_NUSES (label)++;
+	if (tail_branch)
+	  before = emit_label_before (label, before);
+	else
+	  emit_label_after (label, tail);
+      }
+
+  /* Now deal with propagating the branch condition.  */
+  if (cond_branch)
+    {
+      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
+
+      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
+	{
+	  /* Vector mode only, do a shuffle.  */
+	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
+	}
+      else
+	{
+	  /* Includes worker mode, do spill & fill.  By construction
+	     we should never have worker mode only. */
+	  wcast_data_t data;
+
+	  data.base = worker_bcast_sym;
+	  data.ptr = 0;
+
+	  if (worker_bcast_size < GET_MODE_SIZE (SImode))
+	    worker_bcast_size = GET_MODE_SIZE (SImode);
+
+	  data.offset = 0;
+	  emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
+			    before);
+	  /* Barrier so other workers can see the write.  */
+	  emit_insn_before (nvptx_wsync (false), tail);
+	  data.offset = 0;
+	  emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
+	  /* This barrier is needed to avoid worker zero clobbering
+	     the broadcast buffer before all the other workers have
+	     had a chance to read this instance of it.  */
+	  emit_insn_before (nvptx_wsync (true), tail);
+	}
+
+      extract_insn (tail);
+      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
+				 UNSPEC_BR_UNIFIED);
+      validate_change (tail, recog_data.operand_loc[0], unsp, false);
+    }
+}
+
+/* PAR is a parallel that is being skipped in its entirety according to
+   MASK.  Treat this as skipping a superblock starting at forked
+   and ending at joining.  */
+
+static void
+nvptx_skip_par (unsigned mask, parallel *par)
+{
+  basic_block tail = par->join_block;
+  gcc_assert (tail->preds->length () == 1);
+
+  basic_block pre_tail = (*tail->preds)[0]->src;
+  gcc_assert (pre_tail->succs->length () == 1);
+
+  nvptx_single (mask, par->forked_block, pre_tail);
+}
+
+/* Process the parallel PAR and all its contained
+   parallels.  We do everything but the neutering.  Return mask of
+   partitioned modes used within this parallel.  */
+
+static unsigned
+nvptx_process_pars (parallel *par)
+{
+  unsigned inner_mask = par->mask;
+
+  /* Do the inner parallels first.  */
+  if (par->inner)
+    {
+      par->inner_mask = nvptx_process_pars (par->inner);
+      inner_mask |= par->inner_mask;
+    }
+
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+    /* No propagation needed for a call.  */;
+  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+    {
+      nvptx_wpropagate (false, par->forked_block, par->forked_insn);
+      nvptx_wpropagate (true, par->forked_block, par->fork_insn);
+      /* Insert begin and end synchronizations.  */
+      emit_insn_after (nvptx_wsync (false), par->forked_insn);
+      emit_insn_before (nvptx_wsync (true), par->joining_insn);
+    }
+  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+    nvptx_vpropagate (par->forked_block, par->forked_insn);
+
+  /* Now do siblings.  */
+  if (par->next)
+    inner_mask |= nvptx_process_pars (par->next);
+  return inner_mask;
+}
+
+/* Neuter the parallel described by PAR.  We recurse in depth-first
+   order.  MODES are the partitioning of the execution and OUTER is
+   the partitioning of the parallels we are contained in.  */
+
+static void
+nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
+{
+  unsigned me = (par->mask
+		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
+  unsigned skip_mask = 0, neuter_mask = 0;
+  
+  if (par->inner)
+    nvptx_neuter_pars (par->inner, modes, outer | me);
+
+  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
+    {
+      if ((outer | me) & GOMP_DIM_MASK (mode))
+	{} /* Mode is partitioned: no neutering.  */
+      else if (!(modes & GOMP_DIM_MASK (mode)))
+	{} /* Mode is not used: nothing to do.  */  
+      else if (par->inner_mask & GOMP_DIM_MASK (mode)
+	       || !par->forked_insn)
+	/* Partitioned in inner parallels, or we're not partitioned
+	   at all: neuter individual blocks.  */
+	neuter_mask |= GOMP_DIM_MASK (mode);
+      else if (!par->parent || !par->parent->forked_insn
+	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
+	/* Parent isn't a parallel, or it already contains this
+	   partitioning: skip the parallel at this level.  */
+	skip_mask |= GOMP_DIM_MASK (mode);
+      else
+	{} /* Parent will skip this parallel itself.  */
+    }
+
+  if (neuter_mask)
+    {
+      int ix;
+      int len = par->blocks.length ();
+
+      for (ix = 0; ix != len; ix++)
+	{
+	  basic_block block = par->blocks[ix];
+
+	  nvptx_single (neuter_mask, block, block);
+	}
+    }
+
+  if (skip_mask)
+    nvptx_skip_par (skip_mask, par);
+  
+  if (par->next)
+    nvptx_neuter_pars (par->next, modes, outer);
+}
+
 /* PTX-specific reorganization
+   - Scan and release reduction buffers
+   - Split blocks at fork and join instructions
    - Compute live registers
    - Mark now-unused registers, so function begin doesn't declare
    unused registers.
+   - Insert state propagation when entering partitioned mode
+   - Insert neutering instructions when in single mode
    - Replace subregs with suitable sequences.
 */
 
@@ -1989,19 +3047,60 @@  nvptx_reorg (void)
 
   thread_prologue_and_epilogue_insns ();
 
+  /* Split blocks and record interesting unspecs.  */
+  bb_insn_map_t bb_insn_map;
+
+  nvptx_split_blocks (&bb_insn_map);
+
   /* Compute live regs */
   df_clear_flags (DF_LR_RUN_DCE);
   df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
+  df_live_add_problem ();
+  df_live_set_all_dirty ();
   df_analyze ();
   regstat_init_n_sets_and_refs ();
 
-  int max_regs = max_reg_num ();
-
+  if (dump_file)
+    df_dump (dump_file);
+  
   /* Mark unused regs as unused.  */
+  int max_regs = max_reg_num ();
   for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
     if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
       regno_reg_rtx[i] = const0_rtx;
 
+  /* Determine launch dimensions of the function.  If it is not an
+     offloaded function (i.e. this is a regular compiler), the
+     function has no neutering.  */
+  tree attr = get_oacc_fn_attrib (current_function_decl);
+  if (attr)
+    {
+      /* If we determined this mask before RTL expansion, we could
+	 elide emission of some levels of forks and joins.  */
+      unsigned mask = 0;
+      tree dims = TREE_VALUE (attr);
+      unsigned ix;
+
+      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+	{
+	  int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
+	  tree allowed = TREE_PURPOSE (dims);
+
+	  if (size != 1 && !(allowed && integer_zerop (allowed)))
+	    mask |= GOMP_DIM_MASK (ix);
+	}
+      /* If there is worker neutering, there must be vector
+	 neutering.  Otherwise the hardware will fail.  */
+      gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+		  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
+
+      /* Discover & process partitioned regions.  */
+      parallel *pars = nvptx_discover_pars (&bb_insn_map);
+      nvptx_process_pars (pars);
+      nvptx_neuter_pars (pars, mask, 0);
+      delete pars;
+    }
+
   /* Replace subregs.  */
   nvptx_reorg_subreg ();
 
@@ -2052,6 +3151,26 @@  nvptx_vector_alignment (const_tree type)
 
   return MIN (align, BIGGEST_ALIGNMENT);
 }
+
+/* Indicate that INSN cannot be duplicated.   */
+
+static bool
+nvptx_cannot_copy_insn_p (rtx_insn *insn)
+{
+  switch (recog_memoized (insn))
+    {
+    case CODE_FOR_nvptx_shufflesi:
+    case CODE_FOR_nvptx_shufflesf:
+    case CODE_FOR_nvptx_barsync:
+    case CODE_FOR_nvptx_fork:
+    case CODE_FOR_nvptx_forked:
+    case CODE_FOR_nvptx_joining:
+    case CODE_FOR_nvptx_join:
+      return true;
+    default:
+      return false;
+    }
+}
 
 /* Record a symbol for mkoffload to enter into the mapping table.  */
 
@@ -2129,6 +3248,19 @@  nvptx_file_end (void)
   FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
     nvptx_record_fndecl (decl, true);
   fputs (func_decls.str().c_str(), asm_out_file);
+
+  if (worker_bcast_size)
+    {
+      /* Define the broadcast buffer.  */
+
+      worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
+	& ~(worker_bcast_align - 1);
+      
+      fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_bcast_name);
+      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
+	       worker_bcast_align,
+	       worker_bcast_name, worker_bcast_size);
+    }
 }
 
 /* Validate compute dimensions of an OpenACC offload or routine, fill
@@ -2141,12 +3273,32 @@  nvptx_goacc_validate_dims (tree ARG_UNUS
 {
   bool changed = false;
 
-  /* TODO: Leave dimensions unaltered.  Partitioned execution needs
+  /* TODO: Leave dimensions unaltered.  Reductions need
      porting before filtering dimensions makes sense.  */
 
   return changed;
 }
-
+
+/* Determine whether fork & joins are needed.  */
+
+static bool
+nvptx_goacc_fork_join (gcall *call, const int dims[],
+		       bool ARG_UNUSED (is_fork))
+{
+  tree arg = gimple_call_arg (call, 2);
+  unsigned axis = TREE_INT_CST_LOW (arg);
+
+  /* We only care about worker and vector partitioning.  */
+  if (axis < GOMP_DIM_WORKER)
+    return false;
+
+  /* If the size is 1, there's no partitioning.  */
+  if (dims[axis] == 1)
+    return false;
+
+  return true;
+}
+
 #undef TARGET_OPTION_OVERRIDE
 #define TARGET_OPTION_OVERRIDE nvptx_option_override
 
@@ -2233,9 +3385,15 @@  nvptx_goacc_validate_dims (tree ARG_UNUS
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
 
+#undef TARGET_CANNOT_COPY_INSN_P
+#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
+
 #undef TARGET_GOACC_VALIDATE_DIMS
 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
 
+#undef TARGET_GOACC_FORK_JOIN
+#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"
Index: gcc/config/nvptx/nvptx.h
===================================================================
--- gcc/config/nvptx/nvptx.h	(revision 229472)
+++ gcc/config/nvptx/nvptx.h	(working copy)
@@ -230,6 +230,7 @@  struct GTY(()) machine_function
   HOST_WIDE_INT outgoing_stdarg_size;
   int ret_reg_mode; /* machine_mode not defined yet. */
   int punning_buffer_size;
+  rtx axis_predicate[2];
 };
 #endif
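
Finally, a hedged restatement of the decision nvptx_goacc_fork_join
makes, with the gomp-constants.h values inlined (needs_fork_join is an
invented name): the gang level never needs the PTX markers, and an
axis whose launch dimension is 1 is effectively unpartitioned.

#include <stdbool.h>

static bool
needs_fork_join (unsigned axis, const int dims[])
{
  if (axis < 1 /* GOMP_DIM_WORKER */)
    return false;             /* gang: no marker needed */
  return dims[axis] != 1;     /* size 1: not partitioned */
}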