diff mbox series

[2/3,RFC] fsra: support ARG_PARTS

Message ID 20240227070412.3471038-3-guojiufu@linux.ibm.com
State New
Headers show
Series fsra: Add final gimple sra before expander | expand

Commit Message

Jiufu Guo Feb. 27, 2024, 7:04 a.m. UTC
This patch adds IFN_ARG_PARTS and generates this IFN for parameter accesses
in the fsra pass.  The IFN is expanded according to the incoming registers
of the parameter.  "fsra" is tuned for accesses to parameters.

	PR target/108073

gcc/ChangeLog:

	* internal-fn.cc (query_position_in_parallel): New function.
	(construct_reg_seq): New function.
	(get_incoming_element): New function.
	(reference_alias_ptr_type): Extern declare.
	(expand_ARG_PARTS): New expand function.
	* internal-fn.def (ARG_PARTS): New IFN.
	* tree-sra.cc (scan_function): Update for fsra.
	(analyze_access_subtree): Allow scalarizing read-only parameters for fsra.
	(generate_subtree_copies): Update to generate IFN_ARG_PARTS.

gcc/testsuite/ChangeLog:

	* g++.target/powerpc/pr102024.C: Update.
	* gcc.target/powerpc/pr108073-1.c: New test.
	* gcc.target/powerpc/pr108073.c: New test.

---
 gcc/internal-fn.cc                            | 164 ++++++++++++++++++
 gcc/internal-fn.def                           |   3 +
 gcc/tree-sra.cc                               |  43 ++++-
 gcc/testsuite/g++.target/powerpc/pr102024.C   |   2 +-
 gcc/testsuite/gcc.target/powerpc/pr108073-1.c |  76 ++++++++
 gcc/testsuite/gcc.target/powerpc/pr108073.c   |  74 ++++++++
 6 files changed, 354 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c
diff mbox series

Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index a07f25f3aee..ee19e155628 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -3393,6 +3393,170 @@  expand_DEFERRED_INIT (internal_fn, gcall *stmt)
     }
 }
 
+/* Scan the PARALLEL REGS for the bit range {BITPOS, BITSIZE}.  START_INDEX is
+   the slot containing BITPOS, END_INDEX the first slot reaching the range end;
+   LEFT_BITS/RIGHT_BITS are the bits of those slots lying outside the range.  */
+
+void
+query_position_in_parallel (HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+			    rtx regs, int &start_index, int &end_index,
+			    HOST_WIDE_INT &left_bits, HOST_WIDE_INT &right_bits)
+{
+  int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
+  for (; cur_index < XVECLEN (regs, 0); cur_index++)
+    {
+      rtx slot = XVECEXP (regs, 0, cur_index);
+      HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
+      machine_mode mode = GET_MODE (XEXP (slot, 0));
+      HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
+      if (off <= bitpos && off + size > bitpos)
+	{
+	  start_index = cur_index;
+	  left_bits = bitpos - off;
+	}
+      if (off + size >= bitpos + bitsize)
+	{
+	  end_index = cur_index;
+	  right_bits = off + size - (bitpos + bitsize);
+	  break;
+	}
+    }
+}
+
+/* Build a BLKmode PARALLEL of consecutive word_mode registers starting at
+   FIRST_REG, enough to hold SIZE bytes; slot I sits at byte offset I words.  */
+static rtx
+construct_reg_seq (HOST_WIDE_INT size, rtx first_reg)
+{
+  int nregs = size / UNITS_PER_WORD + (((size % UNITS_PER_WORD) != 0) ? 1 : 0);
+  rtx *tmps = XALLOCAVEC (rtx, nregs);
+  int regno = REGNO (first_reg);
+  machine_mode mode = word_mode;
+  HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant ();
+  for (int i = 0; i < nregs; i++)
+    {
+      rtx reg = gen_rtx_REG (mode, regno + i);
+      rtx off = GEN_INT (word_size * i);
+      tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off);
+    }
+  return gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
+}
+
+static rtx
+get_incoming_element (tree arg, HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+		      bool reversep, tree expr)
+{
+  rtx regs = DECL_INCOMING_RTL (arg);
+  bool has_padding = false;
+  if (REG_P (regs) && GET_MODE (regs) == BLKmode)
+    {
+      HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (arg));
+      has_padding = (size % UNITS_PER_WORD) != 0;
+      regs = construct_reg_seq (size, regs);
+    }
+
+  if (GET_CODE (regs) != PARALLEL)
+    return NULL_RTX;
+
+  int start_index = -1;
+  int end_index = -1;
+  HOST_WIDE_INT left_bits = 0;
+  HOST_WIDE_INT right_bits = 0;
+  query_position_in_parallel (bitpos, bitsize, regs, start_index, end_index,
+			      left_bits, right_bits);
+
+  if (start_index < 0 || end_index < 0)
+    return NULL_RTX;
+
+  machine_mode expr_mode = TYPE_MODE (TREE_TYPE (expr));
+  /* Give up when the access spans more than one register.  */
+  if (end_index != start_index)
+    return NULL_RTX;
+
+  rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0);
+  /* The access covers the whole register: return it in EXPR's mode.  */
+  if (left_bits == 0 && right_bits == 0)
+    {
+      if (GET_MODE (reg) != expr_mode)
+	reg = gen_lowpart (expr_mode, reg);
+      return reg;
+    }
+
+  /* The access is only part of the register (left_bits or right_bits is
+     nonzero): extract a bit-field, except from the last slot if padded.  */
+  if (has_padding && end_index == XVECLEN (regs, 0) - 1)
+    return NULL_RTX;
+  scalar_int_mode imode;
+  if (!int_mode_for_mode (expr_mode).exists (&imode))
+    return NULL_RTX;
+
+  if (expr_mode != imode
+      && known_gt (GET_MODE_SIZE (GET_MODE (regs)), UNITS_PER_WORD))
+    return NULL_RTX;
+
+  machine_mode mode = GET_MODE (reg);
+  bool sgn = TYPE_UNSIGNED (TREE_TYPE (expr)); /* True means unsigned.  */
+  rtx bfld = extract_bit_field (reg, bitsize, left_bits, sgn, NULL_RTX, mode,
+				imode, reversep, NULL);
+
+  if (GET_MODE (bfld) != imode)
+    bfld = gen_lowpart (imode, bfld);
+
+  if (expr_mode == imode)
+    return bfld;
+
+  /* expr_mode != imode (e.g. SF vs. SI): copy to an IMODE pseudo, then pun.  */
+  rtx result = gen_reg_rtx (imode);
+  emit_move_insn (result, bfld);
+  return gen_lowpart (expr_mode, result);
+}
+
+tree
+reference_alias_ptr_type (tree t);
+
+static void
+expand_ARG_PARTS (internal_fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  tree arg = gimple_call_arg (stmt, 0);
+  HOST_WIDE_INT offset = tree_to_shwi (gimple_call_arg (stmt, 1));
+  HOST_WIDE_INT size = tree_to_shwi (gimple_call_arg (stmt, 2));
+  int reversep = tree_to_shwi (gimple_call_arg (stmt, 3));
+  rtx sub_elem = get_incoming_element (arg, offset, size, reversep, lhs);
+  if (sub_elem)
+    {
+      rtx to_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+      if (to_rtx)
+	{
+	  gcc_assert (REG_P (to_rtx));
+	  emit_move_insn (to_rtx, sub_elem);
+	  return;
+	}
+    }
+  /* Fall back: word-aligned parts use a MEM_REF, others a bit-field extract. */
+  if ((offset % BITS_PER_WORD == 0) && (size % BITS_PER_WORD == 0))
+    {
+      tree base = build_fold_addr_expr (arg);
+      tree type = reference_alias_ptr_type (arg);
+      tree off = build_int_cst (type, offset / BITS_PER_UNIT);
+      location_t loc = EXPR_LOCATION (arg);
+      tree rhs = fold_build2_loc (loc, MEM_REF, TREE_TYPE (lhs), base, off);
+      REF_REVERSE_STORAGE_ORDER (rhs) = reversep;
+      expand_assignment (lhs, rhs, false);
+    }
+  else
+    {
+      tree type = TREE_TYPE (lhs);
+      machine_mode mode = TYPE_MODE (type);
+      rtx op0
+	= expand_expr_real (arg, NULL, VOIDmode, EXPAND_NORMAL, NULL, true);
+      op0 = extract_bit_field (op0, size, offset, TYPE_UNSIGNED (type), NULL,
+			       mode, mode, reversep, NULL);
+      rtx dest = expand_expr (lhs, NULL, VOIDmode, EXPAND_WRITE);
+      emit_move_insn (dest, op0);
+    }
+}
+
 /* The size of an OpenACC compute dimension.  */
 
 static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index c14d30365c1..2bbf70dd6a1 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -510,6 +510,9 @@  DEF_INTERNAL_FN (PHI, 0, NULL)
    automatic variable.  */
 DEF_INTERNAL_FN (DEFERRED_INIT, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
 
+/* A function to extract element(s) from an aggregate argument in fsra.  */
+DEF_INTERNAL_FN (ARG_PARTS, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
+
 /* DIM_SIZE and DIM_POS return the size of a particular compute
    dimension and the executing thread's position within that
    dimension.  DIM_POS is pure (and not const) so that it isn't
diff --git a/gcc/tree-sra.cc b/gcc/tree-sra.cc
index aacc76f58b5..0bbb8940921 100644
--- a/gcc/tree-sra.cc
+++ b/gcc/tree-sra.cc
@@ -1508,7 +1508,8 @@  scan_function (void)
 	  tree t;
 	  unsigned i;
 
-	  if (gimple_code (stmt) != GIMPLE_CALL)
+	  if (gimple_code (stmt) != GIMPLE_CALL
+	      || sra_mode == SRA_MODE_FINAL_INTRA)
 	    walk_stmt_load_store_addr_ops (stmt, NULL, NULL, NULL,
 					   scan_visit_addr);
 
@@ -2767,12 +2768,22 @@  analyze_access_subtree (struct access *root, struct access *parent,
 	hole = true;
     }
 
+  auto check_rw = [] (struct access *root) -> bool {
+    if ((root->grp_scalar_read || root->grp_assignment_read)
+	&& (root->grp_scalar_write || root->grp_assignment_write))
+      return true;
+    if (sra_mode != SRA_MODE_FINAL_INTRA)
+      return false;
+    if ((root->grp_scalar_read || root->grp_assignment_read)
+	&& TREE_CODE (root->base) == PARM_DECL)
+      return true;
+    return false;
+  };
+
+  /* In fsra, a parameter is scalarizable even if it is never written.  */
   if (allow_replacements && scalar && !root->first_child
       && (totally || !root->grp_total_scalarization)
-      && (totally
-	  || root->grp_hint
-	  || ((root->grp_scalar_read || root->grp_assignment_read)
-	      && (root->grp_scalar_write || root->grp_assignment_write))))
+      && (totally || root->grp_hint || check_rw (root)))
     {
       /* Always create access replacements that cover the whole access.
          For integral types this means the precision has to match.
@@ -2841,6 +2852,11 @@  analyze_access_subtree (struct access *root, struct access *parent,
     root->grp_covered = 1;
   else if (root->grp_write || comes_initialized_p (root->base))
     root->grp_unscalarized_data = 1; /* not covered and written to */
+
+  if (sra_mode == SRA_MODE_FINAL_INTRA && root->grp_write
+      && TREE_CODE (root->base) == PARM_DECL)
+    return false;
+
   return sth_created;
 }
 
@@ -3802,7 +3818,7 @@  generate_subtree_copies (struct access *access, tree agg,
 	      || access->offset + access->size > start_offset))
 	{
 	  tree expr, repl = get_access_replacement (access);
-	  gassign *stmt;
+	  gimple *stmt;
 
 	  expr = build_ref_for_model (loc, agg, access->offset - top_offset,
 				      access, gsi, insert_after);
@@ -3814,7 +3830,20 @@  generate_subtree_copies (struct access *access, tree agg,
 						 !insert_after,
 						 insert_after ? GSI_NEW_STMT
 						 : GSI_SAME_STMT);
-	      stmt = gimple_build_assign (repl, expr);
+	      if (sra_mode == SRA_MODE_FINAL_INTRA
+		  && TREE_CODE (access->base) == PARM_DECL
+		  && (access->grp_scalar_read || access->grp_assignment_read))
+		{
+		  gimple *call = gimple_build_call_internal (
+		    IFN_ARG_PARTS, 4, access->base,
+		    wide_int_to_tree (sizetype, access->offset),
+		    wide_int_to_tree (sizetype, access->size),
+		    wide_int_to_tree (sizetype, access->reverse));
+		  gimple_call_set_lhs (call, repl);
+		  stmt = call;
+		}
+	      else
+		stmt = gimple_build_assign (repl, expr);
 	    }
 	  else
 	    {
diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C b/gcc/testsuite/g++.target/powerpc/pr102024.C
index 769585052b5..c8995cae707 100644
--- a/gcc/testsuite/g++.target/powerpc/pr102024.C
+++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
@@ -5,7 +5,7 @@ 
 // Test that a zero-width bit field in an otherwise homogeneous aggregate
 // generates a psabi warning and passes arguments in GPRs.
 
-// { dg-final { scan-assembler-times {\mstd\M} 4 } }
+// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
 
 struct a_thing
 {
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073-1.c b/gcc/testsuite/gcc.target/powerpc/pr108073-1.c
new file mode 100644
index 00000000000..4892716e85f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108073-1.c
@@ -0,0 +1,76 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target hard_float } */
+/* { dg-options "-O2 -save-temps" } */
+
+typedef struct DF
+{
+  double a[4];
+  short s1;
+  short s2;
+  short s3;
+  short s4;
+} DF;
+typedef struct SF
+{
+  float a[4];
+  int i1;
+  int i2;
+} SF;
+
+/* { dg-final { scan-assembler-times {\mmtvsrd|mtvsrws\M} 3 {target { lp64 && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlwz\M} {target { lp64 && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlhz\M} {target { lp64 && has_arch_pwr8 } } } } */
+
+#define NOIPA __attribute__ ((noipa))
+
+short NOIPA
+foo_hi (DF a, int flag)
+{
+  if (flag == 2)
+    return a.s2 + a.s3;
+  return 0;
+}
+int NOIPA
+foo_si (SF a, int flag)
+{
+  if (flag == 2)
+    return a.i2 + a.i1;
+  return 0;
+}
+double NOIPA
+foo_df (DF arg, int flag)
+{
+  if (flag == 2)
+    return arg.a[3];
+  else
+    return 0.0;
+}
+float NOIPA
+foo_sf (SF arg, int flag)
+{
+  if (flag == 2)
+    return arg.a[2];
+  return 0;
+}
+float NOIPA
+foo_sf1 (SF arg, int flag)
+{
+  if (flag == 2)
+    return arg.a[1];
+  return 0;
+}
+
+DF gdf = {{1.0, 2.0, 3.0, 4.0}, 1, 2, 3, 4};
+SF gsf = {{1.0f, 2.0f, 3.0f, 4.0f}, 1, 2};
+
+int
+main ()
+{
+  if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 4.0
+	&& foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
+    __builtin_abort ();
+  if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
+	&& foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c b/gcc/testsuite/gcc.target/powerpc/pr108073.c
new file mode 100644
index 00000000000..4e7feaa6810
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
@@ -0,0 +1,74 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target hard_float } */
+/* { dg-options "-O2 -save-temps" } */
+
+/* { dg-final { scan-assembler-times {\mmtvsrd|mtvsrws\M} 5 {target { lp64 && { has_arch_pwr8 && be } } } } } */
+/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 4 {target { lp64 && { has_arch_pwr8 && be } } } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrd|mtvsrws\M} 3 {target { lp64 && { has_arch_pwr8 && le } } } } } */
+/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 {target { lp64 && { has_arch_pwr8 && le } } } } } */
+/* { dg-final { scan-assembler-times {\mfadds\M} 2 {target { lp64 && has_arch_pwr8 } } } } */
+
+#define NOIPA __attribute__ ((noipa))
+typedef struct X
+{
+  float x;
+  float y;
+} X;
+
+float NOIPA
+fooX (X y)
+{
+  y.x += 1;
+  return y.x + y.y;
+}
+
+typedef struct Y
+{
+  double a[4];
+  long l;
+} Y;
+
+double NOIPA
+fooY (Y arg)
+{
+  return arg.a[3];
+}
+
+typedef struct Z
+{
+  float a[4];
+  short l;
+} Z;
+
+float NOIPA
+fooZ (Z arg)
+{
+  return arg.a[3];
+}
+
+float NOIPA
+fooZ2 (Z arg)
+{
+  return arg.a[2];
+}
+
+X x = {1.0f, 2.0f};
+Y y = {1.0, 2.0, 3.0, 4.0, 1};
+Z z = {1.0f, 2.0f, 3.0f, 4.0f, 1};
+int
+main ()
+{
+  if (fooX (x) != 4.0f)
+    __builtin_abort ();
+
+  if (fooY (y) != 4.0)
+    __builtin_abort ();
+
+  if (fooZ (z) != 4.0f)
+    __builtin_abort ();
+
+  if (fooZ2 (z) != 3.0f)
+    __builtin_abort ();
+
+  return 0;
+}