[RFC,5/5] vect: Add accumulating-result pattern for lane-reducing operation

Message ID LV2PR01MB7839AE52B14A240021B2E057F7AF2@LV2PR01MB7839.prod.exchangelabs.com
State New
Series [RFC,1/5] vect: Fix single_imm_use in tree_vect_patterns

Commit Message

Feng Xue OS July 21, 2024, 9:15 a.m. UTC
This patch adds a pattern that folds a summation into the last operand of
a lane-reducing operation when appropriate.  It supplements the
operation-specific patterns for dot-prod/sad/widen-sum.

  sum = lane-reducing-op(..., 0) + value;
=>
  sum = lane-reducing-op(..., value);
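
For example, in a loop like the one below (an illustrative variant of the
new test, not literal compiler output), the dot-prod pattern can match each
product with a zero accumulator, leaving a plus statement that this new
pattern folds away:

  int foo (int res, signed char *a, signed char *b,
           signed char *c, signed char *d, int n)
  {
    for (int i = 0; i < n; i++)
      res += a[i] * b[i] + c[i] * d[i];
    return res;
  }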

Thanks,
Feng
---
gcc/
	* tree-vect-patterns.cc (vect_recog_lane_reducing_accum_pattern): New
	pattern function.
	(vect_vect_recog_func_ptrs): Add the new pattern function.
	* params.opt (vect-lane-reducing-accum-pattern): New parameter.

gcc/testsuite/
	* gcc.dg/vect/vect-reduc-accum-pattern.c: New test.
---
 gcc/params.opt                                |   4 +
 .../gcc.dg/vect/vect-reduc-accum-pattern.c    |  61 ++++++++++
 gcc/tree-vect-patterns.cc                     | 106 ++++++++++++++++++
 3 files changed, 171 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-accum-pattern.c

Patch

From 94d34da8de2fd479c81e8398544466e6ffe7fdfc Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Wed, 22 May 2024 17:08:32 +0800
Subject: [PATCH 5/5] vect: Add accumulating-result pattern for lane-reducing
 operation

This patch adds a pattern that folds a summation into the last operand of
a lane-reducing operation when appropriate.  It supplements the
operation-specific patterns for dot-prod/sad/widen-sum.

  sum = lane-reducing-op(..., 0) + value;
=>
  sum = lane-reducing-op(..., value);
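
Taking dot-product as the lane-reducing operation and using made-up SSA
names (an illustrative sketch, not actual dump output), the recognizer
rewrites

  patt_1 = DOT_PROD_EXPR <x, y, 0>;
  sum_2 = patt_1 + value_3;

into a single pattern statement

  sum_2 = DOT_PROD_EXPR <x, y, value_3>;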

2024-05-22 Feng Xue <fxue@os.amperecomputing.com>

gcc/
	* tree-vect-patterns.cc (vect_recog_lane_reducing_accum_pattern): New
	pattern function.
	(vect_vect_recog_func_ptrs): Add the new pattern function.
	* params.opt (vect-lane-reducing-accum-pattern): New parameter.

gcc/testsuite/
	* gcc.dg/vect/vect-reduc-accum-pattern.c: New test.
---
 gcc/params.opt                                |   4 +
 .../gcc.dg/vect/vect-reduc-accum-pattern.c    |  61 ++++++++++
 gcc/tree-vect-patterns.cc                     | 106 ++++++++++++++++++
 3 files changed, 171 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-accum-pattern.c

diff --git a/gcc/params.opt b/gcc/params.opt
index c17ba17b91b..b94bdc26cbd 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1198,6 +1198,10 @@  The maximum factor which the loop vectorizer applies to the cost of statements i
 Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 1) Param Optimization
 Enable loop vectorization of floating point inductions.
 
+-param=vect-lane-reducing-accum-pattern=
+Common Joined UInteger Var(param_vect_lane_reducing_accum_pattern) Init(2) IntegerRange(0, 2) Param Optimization
+Allow combining a plus statement into a lane-reducing operation.  If the value is 2, allow this for all statements; if 1, only for reduction statements; otherwise disable it.
+
 -param=vrp-block-limit=
 Common Joined UInteger Var(param_vrp_block_limit) Init(150000) Optimization Param
 Maximum number of basic blocks before VRP switches to a fast model with less memory requirements.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-accum-pattern.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-accum-pattern.c
new file mode 100644
index 00000000000..80a2c4f047e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-accum-pattern.c
@@ -0,0 +1,61 @@ 
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#define FN(name, S1, S2)			\
+S1 int __attribute__ ((noipa))			\
+name (S1 int res,				\
+      S2 char *restrict a,			\
+      S2 char *restrict b,			\
+      S2 char *restrict c,			\
+      S2 char *restrict d)			\
+{						\
+  for (int i = 0; i < N; i++)			\
+    res += a[i] * b[i];				\
+						\
+  asm volatile ("" ::: "memory");		\
+  for (int i = 0; i < N; ++i)			\
+    res += (a[i] * b[i] + c[i] * d[i]) << 3;	\
+						\
+  return res;					\
+}
+
+FN(f1_vec, signed, signed)
+
+#pragma GCC push_options
+#pragma GCC optimize ("O0")
+FN(f1_novec, signed, signed)
+#pragma GCC pop_options
+
+#define BASE2 ((signed int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  signed char a[N], b[N];
+  signed char c[N], d[N];
+
+#pragma GCC novector
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 + OFFSET + i * 4;
+      c[i] = BASE2 + i * 6;
+      d[i] = BASE2 + OFFSET + i * 5;
+    }
+
+  if (f1_vec (0x12345, a, b, c, d) != f1_novec (0x12345, a, b, c, d))
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vect_recog_lane_reducing_accum_pattern: detected" "vect" { target { vect_sdot_qi } } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index bb037af0b68..9a6b16532e4 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1490,6 +1490,111 @@  vect_recog_abd_pattern (vec_info *vinfo,
   return vect_convert_output (vinfo, stmt_vinfo, out_type, stmt, vectype_out);
 }
 
+/* Function vect_recog_lane_reducing_accum_pattern
+
+   Try to fold a summation into the last operand of lane-reducing operation.
+
+   sum = lane-reducing-op(..., 0) + value;
+
+   A lane-reducing operation has two aspects: the main primitive operation
+   and the appended result accumulation.  Pattern matching for the former
+   aspect is handled by the specific patterns for dot-prod/sad/widen-sum
+   respectively.  This function is in charge of the latter aspect.
+
+   Input:
+
+   * STMT_VINFO: The stmt from which the pattern search begins.
+
+   Output:
+
+   * TYPE_OUT: The type of the output of this pattern.
+
+   * Return value: A new stmt that will be used to replace the sequence of
+   stmts that constitute the pattern, that is:
+	sum = lane-reducing-op(..., value);
+*/
+
+static gimple *
+vect_recog_lane_reducing_accum_pattern (vec_info *vinfo,
+					stmt_vec_info stmt_vinfo,
+					tree *type_out)
+{
+  if (!(stmt_vinfo->reduc_pattern_status & rpatt_formed))
+    return NULL;
+
+  if (param_vect_lane_reducing_accum_pattern == 0)
+    return NULL;
+
+  if (param_vect_lane_reducing_accum_pattern == 1)
+    {
+      /* Only allow combining for a loop reduction statement.  */
+      if (STMT_VINFO_REDUC_IDX (stmt_vinfo) < 0)
+	return NULL;
+    }
+
+  gimple *last_stmt = stmt_vinfo->stmt;
+
+  if (!is_gimple_assign (last_stmt)
+      || gimple_assign_rhs_code (last_stmt) != PLUS_EXPR)
+    return NULL;
+
+  gimple *lane_reducing_stmt = NULL;
+  tree sum_oprnd = NULL_TREE;
+
+  for (unsigned i = 0; i < 2; i++)
+    {
+      tree oprnd = gimple_op (last_stmt, i + 1);
+      vect_unpromoted_value unprom;
+      bool single_use_p = true;
+
+      if (!vect_look_through_possible_promotion (vinfo, oprnd, &unprom,
+						 &single_use_p)
+	  || !single_use_p)
+	continue;
+
+      stmt_vec_info oprnd_vinfo = vect_get_internal_def (vinfo, unprom.op);
+
+      if (!oprnd_vinfo)
+	continue;
+
+      gimple *stmt = oprnd_vinfo->stmt;
+
+      if (lane_reducing_stmt_p (stmt)
+	  && integer_zerop (gimple_op (stmt, gimple_num_ops (stmt) - 1)))
+	{
+	  lane_reducing_stmt = stmt;
+	  sum_oprnd = gimple_op (last_stmt,  2 - i);
+	  break;
+	}
+    }
+
+  if (!lane_reducing_stmt)
+    return NULL;
+
+  tree type = TREE_TYPE (gimple_get_lhs (last_stmt));
+
+  *type_out = get_vectype_for_scalar_type (vinfo, type);
+  if (!*type_out)
+    return NULL;
+
+  vect_pattern_detected ("vect_recog_lane_reducing_accum_pattern", last_stmt);
+
+  tree var = vect_recog_temp_ssa_var (type, NULL);
+  enum tree_code code = gimple_assign_rhs_code (lane_reducing_stmt);
+  gimple *pattern_stmt;
+
+  if (code == WIDEN_SUM_EXPR)
+    pattern_stmt = gimple_build_assign (var, code,
+					gimple_op (lane_reducing_stmt, 1),
+					sum_oprnd);
+  else
+    pattern_stmt = gimple_build_assign (var, code,
+					gimple_op (lane_reducing_stmt, 1),
+					gimple_op (lane_reducing_stmt, 2),
+					sum_oprnd);
+  return pattern_stmt;
+}
+
 /* Recognize an operation that performs ORIG_CODE on widened inputs,
    so that it can be treated as though it had the form:
 
@@ -7084,6 +7189,7 @@  static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_dot_prod_pattern, "dot_prod" },
   { vect_recog_sad_pattern, "sad" },
   { vect_recog_widen_sum_pattern, "widen_sum" },
+  { vect_recog_lane_reducing_accum_pattern, "lane_reducing_accum" },
 
   { vect_recog_bitfield_ref_pattern, "bitfield_ref" },
   { vect_recog_bit_insert_pattern, "bit_insert" },
-- 
2.17.1