
SVE intrinsics: Fold svsra with op1 all zeros to svlsr/svasr.

Message ID F732CE83-A042-4A5B-B1AA-F5DBE624CE0C@nvidia.com

Commit Message

Jennifer Schmitz Oct. 18, 2024, 7:30 a.m. UTC
A common idiom in intrinsics code is an unrolled loop of accumulating
intrinsics, with the accumulator initialized to zero before the loop.
Propagating the initial zero accumulator into the first loop iteration
and simplifying the first accumulate instruction is a desirable
transformation that we should teach GCC.
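As a minimal, hypothetical sketch of such a loop (not taken from this
patch; it assumes <arm_sve.h> and SVE2, expects n to be a multiple of
the vector length, and the function name and shift amount are purely
illustrative):

  #include <arm_sve.h>

  uint64_t
  sum_shifted (const uint64_t *p, int64_t n)
  {
    svbool_t pg = svptrue_b64 ();
    svuint64_t acc = svdup_u64 (0);   /* zero accumulator */
    for (int64_t i = 0; i < n; i += svcntd ())
      /* Accumulate p[i..] shifted right by 2; in the first iteration
         the accumulator is known to be all zeros.  */
      acc = svsra (acc, svld1 (pg, p + i), 2);
    return svaddv (pg, acc);
  }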
Therefore, this patch folds svsra to svlsr/svasr if op1 is all zeros,
producing the lower-latency instructions LSR/ASR instead of USRA/SSRA.
The optimization is implemented in svsra_impl::fold. Because
svlsr/svasr are predicated intrinsics, a ptrue predicate is added, and
the width of the shift amount (imm3) is adjusted to match the function
type. To create the ptrue predicate, a new helper function build_ptrue
is introduced, and gimple_folder::fold_to_ptrue is refactored to use
the new helper.

Tests were added to check the produced assembly for use of LSR/ASR.
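For example, with this patch a call like the following (a hypothetical
wrapper mirroring the new test cases, compiled at -O2 with SVE2
enabled) should assemble to a single LSR rather than materializing the
zero accumulator and using USRA:

  #include <arm_sve.h>

  svuint64_t
  shift_only (svuint64_t x)
  {
    /* op1 is all zeros, so the fold turns this into a predicated
       svlsr with a ptrue predicate, i.e. roughly
         lsr  z0.d, z0.d, #2.  */
    return svsra (svdup_u64 (0), x, 2);
  }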

The patch was bootstrapped and regtested on aarch64-linux-gnu with no regressions.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	* config/aarch64/aarch64-sve-builtins-sve2.cc
	(svsra_impl::fold): Fold svsra to svlsr/svasr if op1 is all zeros.
	* config/aarch64/aarch64-sve-builtins.cc (build_ptrue): New
	function that returns a ptrue tree.
	(gimple_folder::fold_to_ptrue): Refactor to use build_ptrue.
	* config/aarch64/aarch64-sve-builtins.h: Declare build_ptrue.

gcc/testsuite/
	* gcc.target/aarch64/sve2/acle/asm/sra_s32.c: New test.
	* gcc.target/aarch64/sve2/acle/asm/sra_s64.c: Likewise.
	* gcc.target/aarch64/sve2/acle/asm/sra_u32.c: Likewise.
	* gcc.target/aarch64/sve2/acle/asm/sra_u64.c: Likewise.
---
 .../aarch64/aarch64-sve-builtins-sve2.cc      | 29 +++++++++++++++++++
 gcc/config/aarch64/aarch64-sve-builtins.cc    | 28 +++++++++++-------
 gcc/config/aarch64/aarch64-sve-builtins.h     |  1 +
 .../aarch64/sve2/acle/asm/sra_s32.c           |  9 ++++++
 .../aarch64/sve2/acle/asm/sra_s64.c           |  9 ++++++
 .../aarch64/sve2/acle/asm/sra_u32.c           |  9 ++++++
 .../aarch64/sve2/acle/asm/sra_u64.c           |  9 ++++++
 7 files changed, 83 insertions(+), 11 deletions(-)

Patch

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
index 6a20a613f83..0990918cc45 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
@@ -417,6 +417,35 @@  public:
 
 class svsra_impl : public function_base
 {
+public:
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+    /* Fold to svlsr/svasr if op1 is all zeros.  */
+    tree op1 = gimple_call_arg (f.call, 0);
+    if (!integer_zerop (op1))
+      return NULL;
+    function_instance instance ("svlsr", functions::svlsr,
+				shapes::binary_uint_opt_n, MODE_n,
+				f.type_suffix_ids, GROUP_none, PRED_x);
+    if (!f.type_suffix (0).unsigned_p)
+      {
+	instance.base_name = "svasr";
+	instance.base = functions::svasr;
+      }
+    gcall *call = f.redirect_call (instance);
+    unsigned int element_bytes = f.type_suffix (0).element_bytes;
+    /* Add a ptrue as predicate, because unlike svsra, svlsr/svasr are
+       predicated intrinsics.  */
+    gimple_call_set_arg (call, 0, build_ptrue (element_bytes));
+    /* For svsra, the shift amount (imm3) is uint64_t for all function types,
+       but for svlsr/svasr, imm3 has the same width as the function type.  */
+    tree imm3 = gimple_call_arg (f.call, 2);
+    tree imm3_prec = wide_int_to_tree (scalar_types[f.type_suffix (0).vector_type],
+				       wi::to_wide (imm3, element_bytes));
+    gimple_call_set_arg (call, 2, imm3_prec);
+    return call;
+  }
 public:
   rtx
   expand (function_expander &e) const override
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index e7c703c987e..945e9818f4e 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -3456,6 +3456,21 @@  is_ptrue (tree v, unsigned int step)
 	  && vector_cst_all_same (v, step));
 }
 
+/* Return a ptrue tree (type svbool_t) where the element width
+   is given by ELEMENT_BYTES.
+   For example, for ELEMENT_BYTES = 2, we get { 1, 0, 1, 0, ... }.  */
+tree
+build_ptrue (unsigned int element_bytes)
+{
+  tree bool_type = scalar_types[VECTOR_TYPE_svbool_t];
+  tree svbool_type = acle_vector_types[0][VECTOR_TYPE_svbool_t];
+  tree_vector_builder builder (svbool_type, element_bytes, 1);
+  builder.quick_push (build_all_ones_cst (bool_type));
+  for (unsigned int i = 1; i < element_bytes; ++i)
+    builder.quick_push (build_zero_cst (bool_type));
+  return builder.build ();
+}
+
 gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
 			      gimple_stmt_iterator *gsi_in, gcall *call_in)
   : function_call_info (gimple_location (call_in), instance, fndecl),
@@ -3572,17 +3587,8 @@  gimple_folder::fold_to_cstu (poly_uint64 val)
 gimple *
 gimple_folder::fold_to_ptrue ()
 {
-  tree svbool_type = TREE_TYPE (lhs);
-  tree bool_type = TREE_TYPE (svbool_type);
-  unsigned int element_bytes = type_suffix (0).element_bytes;
-
-  /* The return type is svbool_t for all type suffixes, thus for b8 we
-     want { 1, 1, 1, 1, ... }, for b16 we want { 1, 0, 1, 0, ... }, etc.  */
-  tree_vector_builder builder (svbool_type, element_bytes, 1);
-  builder.quick_push (build_all_ones_cst (bool_type));
-  for (unsigned int i = 1; i < element_bytes; ++i)
-    builder.quick_push (build_zero_cst (bool_type));
-  return gimple_build_assign (lhs, builder.build ());
+  return gimple_build_assign (lhs,
+			      build_ptrue (type_suffix (0).element_bytes));
 }
 
 /* Fold the call to a PFALSE.  */
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
index 645e56badbe..c5524b9664f 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins.h
@@ -829,6 +829,7 @@  extern tree acle_svprfop;
 bool vector_cst_all_same (tree, unsigned int);
 bool is_ptrue (tree, unsigned int);
 const function_instance *lookup_fndecl (tree);
+tree build_ptrue (unsigned int);
 
 /* Try to find a mode with the given mode_suffix_info fields.  Return the
    mode on success or MODE_none on failure.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
index ac992dc7b1c..86cf4bd8137 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
@@ -91,3 +91,12 @@  TEST_UNIFORM_Z (sra_32_s32_tied2, svint32_t,
 TEST_UNIFORM_Z (sra_32_s32_untied, svint32_t,
 		z0 = svsra_n_s32 (z1, z2, 32),
 		z0 = svsra (z1, z2, 32))
+
+/*
+** sra_2_s32_zeroop1:
+**	asr	z0\.s, z1\.s, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_s32_zeroop1, svint32_t,
+		z0 = svsra_n_s32 (svdup_s32 (0), z1, 2),
+		z0 = svsra (svdup_s32 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
index 9ea5657ab88..7b39798ba1d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
@@ -91,3 +91,12 @@  TEST_UNIFORM_Z (sra_64_s64_tied2, svint64_t,
 TEST_UNIFORM_Z (sra_64_s64_untied, svint64_t,
 		z0 = svsra_n_s64 (z1, z2, 64),
 		z0 = svsra (z1, z2, 64))
+
+/*
+** sra_2_s64_zeroop1:
+**	asr	z0\.d, z1\.d, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_s64_zeroop1, svint64_t,
+		z0 = svsra_n_s64 (svdup_s64 (0), z1, 2),
+		z0 = svsra (svdup_s64 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
index 090245153f7..001e09ca78d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
@@ -91,3 +91,12 @@  TEST_UNIFORM_Z (sra_32_u32_tied2, svuint32_t,
 TEST_UNIFORM_Z (sra_32_u32_untied, svuint32_t,
 		z0 = svsra_n_u32 (z1, z2, 32),
 		z0 = svsra (z1, z2, 32))
+
+/*
+** sra_2_u32_zeroop1:
+**	lsr	z0\.s, z1\.s, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_u32_zeroop1, svuint32_t,
+		z0 = svsra_n_u32 (svdup_u32 (0), z1, 2),
+		z0 = svsra (svdup_u32 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
index ff21c368b72..780cf7a7ff6 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
@@ -91,3 +91,12 @@  TEST_UNIFORM_Z (sra_64_u64_tied2, svuint64_t,
 TEST_UNIFORM_Z (sra_64_u64_untied, svuint64_t,
 		z0 = svsra_n_u64 (z1, z2, 64),
 		z0 = svsra (z1, z2, 64))
+
+/*
+** sra_2_u64_zeroop1:
+**	lsr	z0\.d, z1\.d, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_u64_zeroop1, svuint64_t,
+		z0 = svsra_n_u64 (svdup_u64 (0), z1, 2),
+		z0 = svsra (svdup_u64 (0), z1, 2))