@@ -2561,6 +2561,17 @@ vector_cst_all_same (tree v, unsigned int step)
return true;
}
+/* Return true if V is a constant predicate that acts as a ptrue when
+   predicating STEP-byte elements.
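+   For example, the constant generated for svptrue_b32 () acts as a
+   ptrue when predicating 4-byte elements, but not when predicating
+   1-byte or 2-byte elements.  */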
+bool
+is_ptrue (tree v, unsigned int step)
+{
+ return (TREE_CODE (v) == VECTOR_CST
+ && TYPE_MODE (TREE_TYPE (v)) == VNx16BImode
+ && integer_nonzerop (VECTOR_CST_ENCODED_ELT (v, 0))
+ && vector_cst_all_same (v, step));
+}
+
gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
gimple_stmt_iterator *gsi_in, gcall *call_in)
: function_call_info (gimple_location (call_in), instance, fndecl),
@@ -2635,6 +2646,37 @@ gimple_folder::redirect_call (const function_instance &instance)
return call;
}
+/* Redirect _z and _m calls to _x functions if the predicate is all-true.
+   This allows us to use unpredicated instructions, where available.
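+   For example, svadd_z (svptrue_b8 (), x, y) computes the same value
+   in every lane as svadd_x (svptrue_b8 (), x, y), and the latter can
+   use an unpredicated ADD.  */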
+gimple *
+gimple_folder::redirect_pred_x ()
+{
+ if (pred != PRED_z && pred != PRED_m)
+ return nullptr;
+
+ if (gimple_call_num_args (call) < 2)
+ return nullptr;
+
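+  /* Punt if the result or the leading arguments are not vectors, since
+     the element size reasoning below does not apply to them.  */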
+ tree lhs_type = TREE_TYPE (TREE_TYPE (fndecl));
+ tree arg0_type = type_argument_type (TREE_TYPE (fndecl), 1);
+ tree arg1_type = type_argument_type (TREE_TYPE (fndecl), 2);
+ if (!VECTOR_TYPE_P (lhs_type)
+ || !VECTOR_TYPE_P (arg0_type)
+ || !VECTOR_TYPE_P (arg1_type))
+ return nullptr;
+
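+  /* The predicate must act as a ptrue for the widest element size
+     involved (hence the MAX below); a ptrue for a wider element size
+     would leave some of those lanes inactive.  */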
+ auto lhs_step = element_precision (lhs_type);
+ auto rhs_step = element_precision (arg1_type);
+ auto step = MAX (lhs_step, rhs_step);
+ if (!multiple_p (step, BITS_PER_UNIT)
+ || !is_ptrue (gimple_call_arg (call, 0), step / BITS_PER_UNIT))
+ return nullptr;
+
+ function_instance instance (*this);
+ instance.pred = PRED_x;
+ return redirect_call (instance);
+}
+
/* Fold the call to constant VAL. */
gimple *
gimple_folder::fold_to_cstu (poly_uint64 val)
@@ -2707,6 +2749,10 @@ gimple_folder::fold ()
if (!lhs && TREE_TYPE (gimple_call_fntype (call)) != void_type_node)
return NULL;
+ /* First try some simplifications that are common to many functions. */
+ if (auto *call = redirect_pred_x ())
+ return call;
+
return base->fold (*this);
}
@@ -500,6 +500,8 @@ public:
tree load_store_cookie (tree);
gimple *redirect_call (const function_instance &);
+ gimple *redirect_pred_x ();
+
gimple *fold_to_cstu (poly_uint64);
gimple *fold_to_pfalse ();
gimple *fold_to_ptrue ();
@@ -673,6 +675,7 @@ extern tree acle_svpattern;
extern tree acle_svprfop;
bool vector_cst_all_same (tree, unsigned int);
+bool is_ptrue (tree, unsigned int);
/* Return the ACLE type svbool_t. */
inline tree
new file mode 100644
@@ -0,0 +1,378 @@
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
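+/* Test that _z and _m calls whose governing predicate is an all-true
+   constant for the element sizes involved are treated like the
+   corresponding _x calls, allowing unpredicated instructions to be
+   used where available, while calls whose ptrue is for a wider element
+   size keep their original predication.  */
+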
+#include <arm_sve.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+** add1:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add1 (svint32_t x, svint32_t y)
+{
+ return svadd_z (svptrue_b8 (), x, y);
+}
+
+/*
+** add2:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add2 (svint32_t x, svint32_t y)
+{
+ return svadd_z (svptrue_b16 (), x, y);
+}
+
+/*
+** add3:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add3 (svint32_t x, svint32_t y)
+{
+ return svadd_z (svptrue_b32 (), x, y);
+}
+
+/*
+** add4:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svint32_t
+add4 (svint32_t x, svint32_t y)
+{
+ return svadd_z (svptrue_b64 (), x, y);
+}
+
+/*
+** add5:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add5 (svint32_t x, svint32_t y)
+{
+ return svadd_m (svptrue_b8 (), x, y);
+}
+
+/*
+** add6:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add6 (svint32_t x, svint32_t y)
+{
+ return svadd_m (svptrue_b16 (), x, y);
+}
+
+/*
+** add7:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add7 (svint32_t x, svint32_t y)
+{
+ return svadd_m (svptrue_b32 (), x, y);
+}
+
+/*
+** add8:
+** ptrue (p[0-7])\.d(?:, all)?
+** add z0\.s, \1/m, z0\.s, z1\.s
+** ret
+*/
+svint32_t
+add8 (svint32_t x, svint32_t y)
+{
+ return svadd_m (svptrue_b64 (), x, y);
+}
+
+/*
+** add9:
+** ptrue (p[0-7])\.s(?:, all)?
+** add z0\.h, \1/m, z0\.h, z1\.h
+** ret
+*/
+svint16_t
+add9 (svint16_t x, svint16_t y)
+{
+ return svadd_m (svptrue_b32 (), x, y);
+}
+
+/*
+** and1:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and1 (svint32_t x)
+{
+ return svand_z (svptrue_b8 (), x, 1);
+}
+
+/*
+** and2:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and2 (svint32_t x)
+{
+ return svand_z (svptrue_b16 (), x, 1);
+}
+
+/*
+** and3:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and3 (svint32_t x)
+{
+ return svand_z (svptrue_b32 (), x, 1);
+}
+
+/*
+** and4:
+** (?!and z0\.s, z0\.s, #).*
+** ret
+*/
+svint32_t
+and4 (svint32_t x)
+{
+ return svand_z (svptrue_b64 (), x, 1);
+}
+
+/*
+** and5:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and5 (svint32_t x)
+{
+ return svand_m (svptrue_b8 (), x, 1);
+}
+
+/*
+** and6:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and6 (svint32_t x)
+{
+ return svand_m (svptrue_b16 (), x, 1);
+}
+
+/*
+** and7:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and7 (svint32_t x)
+{
+ return svand_m (svptrue_b32 (), x, 1);
+}
+
+/*
+** and8:
+** (?!and z0\.s, z0\.s, #).*
+** ret
+*/
+svint32_t
+and8 (svint32_t x)
+{
+ return svand_m (svptrue_b64 (), x, 1);
+}
+
+/*
+** and9:
+** (
+** and p0\.b, p0/z, p1\.b, p1\.b
+** |
+** and p0\.b, p1/z, p0\.b, p0\.b
+** )
+** ret
+*/
+svbool_t
+and9 (svbool_t x, svbool_t y)
+{
+ return svand_z (svptrue_b8 (), x, y);
+}
+
+/*
+** not1:
+** ptrue (p[0-7])\.b(?:, all)?
+** not z0\.s, \1/m, z1\.s
+** ret
+*/
+svint32_t
+not1 (svint32_t x, svint32_t y)
+{
+ return svnot_m (x, svptrue_b8 (), y);
+}
+
+/*
+** cvt1:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvtzs z0\.s, \1/m, z0\.h
+** ret
+*/
+svint32_t
+cvt1 (svfloat16_t x)
+{
+ return svcvt_s32_z (svptrue_b8 (), x);
+}
+
+/*
+** cvt2:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvtzs z0\.s, \1/m, z0\.h
+** ret
+*/
+svint32_t
+cvt2 (svfloat16_t x)
+{
+ return svcvt_s32_z (svptrue_b16 (), x);
+}
+
+/*
+** cvt3:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvtzs z0\.s, \1/m, z0\.h
+** ret
+*/
+svint32_t
+cvt3 (svfloat16_t x)
+{
+ return svcvt_s32_z (svptrue_b32 (), x);
+}
+
+/*
+** cvt4:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svint32_t
+cvt4 (svfloat16_t x)
+{
+ return svcvt_s32_z (svptrue_b64 (), x);
+}
+
+/*
+** cvt5:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvt z0\.h, \1/m, z0\.s
+** ret
+*/
+svfloat16_t
+cvt5 (svfloat32_t x)
+{
+ return svcvt_f16_z (svptrue_b8 (), x);
+}
+
+/*
+** cvt6:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvt z0\.h, \1/m, z0\.s
+** ret
+*/
+svfloat16_t
+cvt6 (svfloat32_t x)
+{
+ return svcvt_f16_z (svptrue_b16 (), x);
+}
+
+/*
+** cvt7:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvt z0\.h, \1/m, z0\.s
+** ret
+*/
+svfloat16_t
+cvt7 (svfloat32_t x)
+{
+ return svcvt_f16_z (svptrue_b32 (), x);
+}
+
+/*
+** cvt8:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svfloat16_t
+cvt8 (svfloat32_t x)
+{
+ return svcvt_f16_z (svptrue_b64 (), x);
+}
+
+/*
+** cvt9:
+** ptrue (p[0-7])\.b(?:, all)?
+** scvtf z0\.h, \1/m, z0\.h
+** ret
+*/
+svfloat16_t
+cvt9 (svint16_t x)
+{
+ return svcvt_f16_z (svptrue_b8 (), x);
+}
+
+/*
+** cvt10:
+** ptrue (p[0-7])\.b(?:, all)?
+** scvtf z0\.h, \1/m, z0\.h
+** ret
+*/
+svfloat16_t
+cvt10 (svint16_t x)
+{
+ return svcvt_f16_z (svptrue_b16 (), x);
+}
+
+/*
+** cvt11:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svfloat16_t
+cvt11 (svint16_t x)
+{
+ return svcvt_f16_z (svptrue_b32 (), x);
+}
+
+/*
+** cvt12:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svfloat16_t
+cvt12 (svint16_t x)
+{
+ return svcvt_f16_z (svptrue_b64 (), x);
+}
+
+#ifdef __cplusplus
+}
+#endif