@@ -417,6 +417,35 @@ public:
class svsra_impl : public function_base
{
+public:
+ gimple *
+ fold (gimple_folder &f) const override
+ {
+ /* Fold to svlsr/svasr if op1 is all zeros. */
+ tree op1 = gimple_call_arg (f.call, 0);
+ if (!integer_zerop (op1))
+ return NULL;
+ function_instance instance ("svlsr", functions::svlsr,
+ shapes::binary_uint_opt_n, MODE_n,
+ f.type_suffix_ids, GROUP_none, PRED_x);
+ if (!f.type_suffix (0).unsigned_p)
+ {
+ instance.base_name = "svasr";
+ instance.base = functions::svasr;
+ }
+ gcall *call = f.redirect_call (instance);
+ unsigned int element_bytes = f.type_suffix (0).element_bytes;
+ /* Add a ptrue as predicate, because unlike svsra, svlsr/svasr are
+ predicated intrinsics. */
+ gimple_call_set_arg (call, 0, build_ptrue (element_bytes));
+ /* For svsra, the shift amount (imm3) is uint64_t for all function types,
+ but for svlsr/svasr, imm3 has the same width as the function type. */
+ tree imm3 = gimple_call_arg (f.call, 2);
+    tree imm3_prec = wide_int_to_tree (scalar_types[f.type_suffix (0).vector_type],
+				       wi::to_wide (imm3, f.type_suffix (0).element_bits));
+ gimple_call_set_arg (call, 2, imm3_prec);
+ return call;
+ }
public:
rtx
expand (function_expander &e) const override
@@ -3456,6 +3456,21 @@ is_ptrue (tree v, unsigned int step)
&& vector_cst_all_same (v, step));
}
+/* Return a ptrue tree (type svbool_t) where the element width
+ is given by ELEMENT_BYTES.
+ For example, for ELEMENT_BYTES = 2, we get { 1, 0, 1, 0, ... }. */
+tree
+build_ptrue (unsigned int element_bytes)
+{
+ tree bool_type = scalar_types[VECTOR_TYPE_svbool_t];
+ tree svbool_type = acle_vector_types[0][VECTOR_TYPE_svbool_t];
+ tree_vector_builder builder (svbool_type, element_bytes, 1);
+ builder.quick_push (build_all_ones_cst (bool_type));
+ for (unsigned int i = 1; i < element_bytes; ++i)
+ builder.quick_push (build_zero_cst (bool_type));
+ return builder.build ();
+}
+
gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
gimple_stmt_iterator *gsi_in, gcall *call_in)
: function_call_info (gimple_location (call_in), instance, fndecl),
@@ -3572,17 +3587,8 @@ gimple_folder::fold_to_cstu (poly_uint64 val)
gimple *
gimple_folder::fold_to_ptrue ()
{
- tree svbool_type = TREE_TYPE (lhs);
- tree bool_type = TREE_TYPE (svbool_type);
- unsigned int element_bytes = type_suffix (0).element_bytes;
-
- /* The return type is svbool_t for all type suffixes, thus for b8 we
- want { 1, 1, 1, 1, ... }, for b16 we want { 1, 0, 1, 0, ... }, etc. */
- tree_vector_builder builder (svbool_type, element_bytes, 1);
- builder.quick_push (build_all_ones_cst (bool_type));
- for (unsigned int i = 1; i < element_bytes; ++i)
- builder.quick_push (build_zero_cst (bool_type));
- return gimple_build_assign (lhs, builder.build ());
+ return gimple_build_assign (lhs,
+ build_ptrue (type_suffix (0).element_bytes));
}
/* Fold the call to a PFALSE. */
@@ -829,6 +829,7 @@ extern tree acle_svprfop;
bool vector_cst_all_same (tree, unsigned int);
bool is_ptrue (tree, unsigned int);
const function_instance *lookup_fndecl (tree);
+tree build_ptrue (unsigned int);
/* Try to find a mode with the given mode_suffix_info fields. Return the
mode on success or MODE_none on failure. */
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_s32_tied2, svint32_t,
TEST_UNIFORM_Z (sra_32_s32_untied, svint32_t,
z0 = svsra_n_s32 (z1, z2, 32),
z0 = svsra (z1, z2, 32))
+
+/*
+** sra_2_s32_zeroop1:
+** asr z0\.s, z1\.s, #2
+** ret
+*/
+TEST_UNIFORM_Z (sra_2_s32_zeroop1, svint32_t,
+ z0 = svsra_n_s32 (svdup_s32 (0), z1, 2),
+ z0 = svsra (svdup_s32 (0), z1, 2))
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_s64_tied2, svint64_t,
TEST_UNIFORM_Z (sra_64_s64_untied, svint64_t,
z0 = svsra_n_s64 (z1, z2, 64),
z0 = svsra (z1, z2, 64))
+
+/*
+** sra_2_s64_zeroop1:
+** asr z0\.d, z1\.d, #2
+** ret
+*/
+TEST_UNIFORM_Z (sra_2_s64_zeroop1, svint64_t,
+ z0 = svsra_n_s64 (svdup_s64 (0), z1, 2),
+ z0 = svsra (svdup_s64 (0), z1, 2))
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_u32_tied2, svuint32_t,
TEST_UNIFORM_Z (sra_32_u32_untied, svuint32_t,
z0 = svsra_n_u32 (z1, z2, 32),
z0 = svsra (z1, z2, 32))
+
+/*
+** sra_2_u32_zeroop1:
+** lsr z0\.s, z1\.s, #2
+** ret
+*/
+TEST_UNIFORM_Z (sra_2_u32_zeroop1, svuint32_t,
+ z0 = svsra_n_u32 (svdup_u32 (0), z1, 2),
+ z0 = svsra (svdup_u32 (0), z1, 2))
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_u64_tied2, svuint64_t,
TEST_UNIFORM_Z (sra_64_u64_untied, svuint64_t,
z0 = svsra_n_u64 (z1, z2, 64),
z0 = svsra (z1, z2, 64))
+
+/*
+** sra_2_u64_zeroop1:
+** lsr z0\.d, z1\.d, #2
+** ret
+*/
+TEST_UNIFORM_Z (sra_2_u64_zeroop1, svuint64_t,
+ z0 = svsra_n_u64 (svdup_u64 (0), z1, 2),
+ z0 = svsra (svdup_u64 (0), z1, 2))
A common idiom in intrinsics loops is to have accumulator intrinsics
in an unrolled loop with an accumulator initialized to zero at the
beginning.  Propagating the initial zero accumulator into the first
iteration of the loop and simplifying the first accumulate instruction
is a desirable transformation that we should teach GCC.

Therefore, this patch folds svsra to svlsr/svasr if op1 is all zeros,
producing the lower latency instructions LSR/ASR instead of USRA/SSRA.
We implemented this optimization in svsra_impl::fold.
Because svlsr/svasr are predicated intrinsics, we added a ptrue
predicate.  Additionally, the width of the shift amount (imm3) was
adjusted to fit the function type.
In order to create the ptrue predicate, a new helper function
build_ptrue was added.  We also refactored gimple_folder::fold_to_ptrue
to use the new helper function.

Tests were added to check the produced assembly for use of LSR/ASR.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	* config/aarch64/aarch64-sve-builtins-sve2.cc (svsra_impl::fold):
	Fold svsra to svlsr/svasr if op1 is all zeros.
	* config/aarch64/aarch64-sve-builtins.cc (build_ptrue): New
	function that returns a ptrue tree.
	(gimple_folder::fold_to_ptrue): Refactor to use build_ptrue.
	* config/aarch64/aarch64-sve-builtins.h: Declare build_ptrue.

gcc/testsuite/
	* gcc.target/aarch64/sve2/acle/asm/sra_s32.c: New test.
	* gcc.target/aarch64/sve2/acle/asm/sra_s64.c: Likewise.
	* gcc.target/aarch64/sve2/acle/asm/sra_u32.c: Likewise.
	* gcc.target/aarch64/sve2/acle/asm/sra_u64.c: Likewise.
---
 .../aarch64/aarch64-sve-builtins-sve2.cc      | 29 +++++++++++++++++++
 gcc/config/aarch64/aarch64-sve-builtins.cc    | 28 +++++++++++-------
 gcc/config/aarch64/aarch64-sve-builtins.h     |  1 +
 .../aarch64/sve2/acle/asm/sra_s32.c           |  9 ++++++
 .../aarch64/sve2/acle/asm/sra_s64.c           |  9 ++++++
 .../aarch64/sve2/acle/asm/sra_u32.c           |  9 ++++++
 .../aarch64/sve2/acle/asm/sra_u64.c           |  9 ++++++
 7 files changed, 83 insertions(+), 11 deletions(-)
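
As an aside (not part of the patch), here is a minimal sketch of the
kind of accumulator loop this fold is aimed at.  The function and
variable names are made up for illustration, and it assumes an
SVE2-enabled target:

#include <arm_sve.h>

svint32_t
accumulate_shifted (const int32_t *src, int64_t n)
{
  /* Zero-initialized accumulator, as in the idiom described above.  */
  svint32_t acc = svdup_s32 (0);
  for (int64_t i = 0; i < n; i += svcntw ())
    {
      svbool_t pg = svwhilelt_b32 (i, n);
      svint32_t x = svld1 (pg, src + i);
      /* acc + (x >> 2).  Once the zero accumulator is propagated into
	 the first peeled/unrolled iteration, this svsra call folds to
	 the equivalent of svasr_x (svptrue_b32 (), x, 2).  */
      acc = svsra (acc, x, 2);
    }
  return acc;
}

With the fold, the first such svsra compiles to a plain ASR instead of
SSRA, which is what the new sra_2_s32_zeroop1 test checks for.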