new file mode 100644
@@ -0,0 +1,124 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-options "-O2 -mvsx" } */
+
+#include "vec-perm-ctor.h"
+
+#include <stdlib.h>
+
+int
+main ()
+{
+ du a_du = 100ULL;
+ du b_du = 200ULL;
+
+ di a_di = -100;
+ di b_di = 200;
+
+ df a_df = 10.0;
+ df b_df = 20.0;
+
+ si a_si = 12;
+ si b_si = -25;
+ si c_si = -37;
+ si d_si = 50;
+
+ sf a_sf = 30.0f;
+ sf b_sf = 40.0f;
+ sf c_sf = 50.0f;
+ sf d_sf = 60.0f;
+
+ hu a_hu = 10;
+ hu b_hu = 20;
+ hu c_hu = 30;
+ hu d_hu = 40;
+ hu e_hu = 50;
+ hu f_hu = 60;
+ hu g_hu = 70;
+ hu h_hu = 80;
+
+ qi a_qi = 10;
+ qi b_qi = 20;
+ qi c_qi = -30;
+ qi d_qi = 40;
+ qi e_qi = -50;
+ qi f_qi = 60;
+ qi g_qi = 70;
+ qi h_qi = -80;
+
+ v2du res1 = test_ctor_ctor_same_du (a_du, b_du);
+ if (res1[0] != a_du || res1[1] != b_du)
+ abort ();
+
+ v2df res2 = test_ctor_ctor_same_df (a_df, b_df);
+ if (res2[0] != a_df || res2[1] != b_df)
+ abort ();
+
+ v4si res3 = test_ctor_ctor_same_si (a_si, b_si, c_si, d_si);
+ if (res3[0] != a_si || res3[1] != b_si || res3[2] != c_si || res3[3] != d_si)
+ abort ();
+
+ v4sf res4 = test_ctor_ctor_same_sf (a_sf, b_sf, c_sf, d_sf);
+ if (res4[0] != a_sf || res4[1] != b_sf || res4[2] != c_sf || res4[3] != d_sf)
+ abort ();
+
+ v8hu res5
+ = test_ctor_ctor_same_hu (a_hu, b_hu, c_hu, d_hu, e_hu, f_hu, g_hu, h_hu);
+
+ if (res5[0] != a_hu || res5[1] != b_hu || res5[2] != c_hu || res5[3] != d_hu
+ || res5[4] != e_hu || res5[5] != f_hu || res5[6] != g_hu
+ || res5[7] != h_hu)
+ abort ();
+
+ v16qi res6
+ = test_ctor_ctor_same_qi (a_qi, b_qi, c_qi, d_qi, e_qi, f_qi, g_qi, h_qi);
+
+ if (res6[0] != a_qi || res6[1] != b_qi || res6[2] != c_qi || res6[3] != d_qi
+ || res6[4] != a_qi || res6[5] != b_qi || res6[6] != c_qi
+ || res6[7] != d_qi || res6[8] != e_qi || res6[9] != f_qi
+ || res6[10] != g_qi || res6[11] != h_qi || res6[12] != e_qi
+ || res6[13] != f_qi || res6[14] != g_qi || res6[15] != h_qi)
+ abort ();
+
+ v2du res7 = test_ctor_cst_same_du (a_du, b_du);
+ if (res7[0] != a_du || res7[1] != 100)
+ abort ();
+
+ v4sf res8 = test_ctor_cst_same_sf (a_sf, b_sf);
+ if (res8[0] != a_sf || res8[1] != 2.0f || res8[2] != b_sf || res8[3] != 4.0f)
+ abort ();
+
+ v2df res9 = test_ctor_cst_same_df (a_df, b_df);
+ if (res9[0] != b_df || res9[1] != 200.0)
+ abort ();
+
+ v4si res10 = test_cst_ctor_same_si (a_si, b_si);
+ if (res10[0] != 1 || res10[1] != 3 || res10[2] != a_si || res10[3] != b_si)
+ abort ();
+
+ v2di res11 = test_ctor_cst_diff_di_si (a_di, b_di);
+  /* We need to take endianness into account: since the function converts
+     the vector constant to a vector type with a different element size,
+     the endianness determines the reinterpreted layout.  Same for res12.  */
+ if (res11[0] != -100 ||
+#ifdef __LITTLE_ENDIAN__
+ res11[1] != 3
+#else
+ res11[1] != 0x300000000LL
+#endif
+ )
+ abort ();
+
+ v2du res12 = test_cst_ctor_diff_sf_du (a_du, b_du);
+ if (
+#ifdef __LITTLE_ENDIAN__
+ res12[0] != 0x400000003f800000ULL
+#else
+ res12[0] != 0x3f80000040000000ULL
+#endif
+ || res12[1] != 100)
+ abort ();
+
+ return 0;
+}
+
new file mode 100644
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx -fdump-tree-optimized" } */
+
+/* Test that all permutations fed by CTORs and CSTs can be optimized away.  */
+
+#include "vec-perm-ctor.h"
+
+/* { dg-final { scan-tree-dump-not "VIEW_CONVERT_EXPR" "optimized" } } */
new file mode 100644
@@ -0,0 +1,163 @@
+#include "altivec.h"
+
+typedef vector unsigned long long v2du;
+typedef vector signed long long v2di;
+typedef vector unsigned int v4su;
+typedef vector signed int v4si;
+typedef vector unsigned short v8hu;
+typedef vector signed short v8hi;
+typedef vector unsigned char v16qu;
+typedef vector signed char v16qi;
+typedef vector double v2df;
+typedef vector float v4sf;
+
+typedef unsigned long long du;
+typedef signed long long di;
+typedef unsigned int su;
+typedef signed int si;
+typedef unsigned short hu;
+typedef signed short hi;
+typedef unsigned char qu;
+typedef signed char qi;
+typedef double df;
+typedef float sf;
+
+/* Test whether we can optimize a vector permutation away when both
+   inputs are CTORs of the same type, or when one input is a CTOR
+   and the other is a CST.  */
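+
+/* A rough sketch of the expected folding for the "du" case below; the
+   GIMPLE and SSA names are illustrative only, not taken from a real dump:
+
+     _1 = {a, 0};
+     _2 = VIEW_CONVERT_EXPR<v16qu>(_1);
+     _3 = {b, 0};
+     _4 = VIEW_CONVERT_EXPR<v16qu>(_3);
+     _5 = VEC_PERM_EXPR <_2, _4, { 0, ..., 7, 16, ..., 23 }>;
+
+   is expected to become a plain CTOR {a, b} of type v2du, plus a
+   view-convert back to the byte vector type that later folding should
+   remove (this is what the accompanying scan-tree-dump-not test checks).  */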
+
+/* CTOR + CTOR part (only same type supported). */
+
+/* Test: both operands are CTORs of the same type (unsigned long long).  */
+__attribute__ ((noipa)) v2du
+test_ctor_ctor_same_du (du a, du b)
+{
+ v2du v1 = {a, 0};
+ v2du v2 = {b, 0};
+ v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+ v2du vres = (v2du) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test: both operands are CTORs of the same type (double).  */
+__attribute__ ((noipa)) v2df
+test_ctor_ctor_same_df (df a, df b)
+{
+ v2df v1 = {0.0, a};
+ v2df v2 = {0.0, b};
+ v16qu vc = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+ v2df vres = (v2df) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test: both operands are CTORs of the same type (signed int).  */
+__attribute__ ((noipa)) v4si
+test_ctor_ctor_same_si (si a, si b, si c, si d)
+{
+ v4si v1 = {0, a, 0, c};
+ v4si v2 = {0, b, 0, d};
+ v16qu vc = {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
+ v4si vres = (v4si) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test: both operands are CTORs of the same type (float).  */
+__attribute__ ((noipa)) v4sf
+test_ctor_ctor_same_sf (sf a, sf b, sf c, sf d)
+{
+ v4sf v1 = {c, 0.0f, d, 0.0f};
+ v4sf v2 = {a, 0.0f, b, 0.0f};
+ v16qu vc = {16, 17, 18, 19, 24, 25, 26, 27, 0, 1, 2, 3, 8, 9, 10, 11};
+ v4sf vres = (v4sf) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test: both operands are CTORs of the same type (unsigned short).  */
+__attribute__ ((noipa)) v8hu
+test_ctor_ctor_same_hu (hu a, hu b, hu c, hu d, hu e, hu f, hu g, hu h)
+{
+ v8hu v1 = {0, a, 0, b, 0, c, 0, d};
+ v8hu v2 = {0, e, 0, f, 0, g, 0, h};
+ v16qu vc = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
+ v8hu vres = (v8hu) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test: both operands are CTORs of the same type (signed char).  */
+__attribute__ ((noipa)) v16qi
+test_ctor_ctor_same_qi (qi a, qi b, qi c, qi d, qi e, qi f, qi g, qi h)
+{
+ v16qi v1 = {0, a, 0, b, 0, c, 0, d, 0, a, 0, b, 0, c, 0, d};
+ v16qi v2 = {0, e, 0, f, 0, g, 0, h, 0, e, 0, f, 0, g, 0, h};
+ v16qu vc = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
+ v16qi vres = (v16qi) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* CTOR + CST part (same type). */
+
+__attribute__ ((noipa)) v2du
+test_ctor_cst_same_du (du a, du b)
+{
+ v2du v1 = {a, b};
+ v2du v2 = {100, 200};
+ v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+ v2du vres = (v2du) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+__attribute__ ((noipa)) v4sf
+test_ctor_cst_same_sf (sf a, sf b)
+{
+ v4sf v1 = {0.0f, a, 0.0f, b};
+ v4sf v2 = {1.0f, 2.0f, 3.0f, 4.0f};
+ v16qu vc = {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
+ v4sf vres = (v4sf) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* CST + CTOR part (same type). */
+
+__attribute__ ((noipa)) v2df
+test_ctor_cst_same_df (df a, df b)
+{
+ v2df v1 = {a, b};
+ v2df v2 = {100.0, 200.0};
+ v16qu vc = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+ v2df vres = (v2df) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+__attribute__ ((noipa)) v4si
+test_cst_ctor_same_si (si a, si b)
+{
+ v4si v1 = {a, 0, b, 0};
+ v4si v2 = {1, 2, 3, 4};
+ v16qu vc = {16, 17, 18, 19, 24, 25, 26, 27, 0, 1, 2, 3, 8, 9, 10, 11};
+ v4si vres = (v4si) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* CTOR + CST part (different types). */
+
+__attribute__ ((noipa)) v2di
+test_ctor_cst_diff_di_si (di a, di b)
+{
+ v2di v1 = {a, b};
+ v4si v2 = {3, 0, 4, 0};
+ v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+ v2di vres = (v2di) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* CST + CTOR part (different types). */
+
+__attribute__ ((noipa)) v2du
+test_cst_ctor_diff_sf_du (du a, du b)
+{
+ v4sf v1 = {1.0f, 2.0f, 3.0f, 4.0f};
+ v2du v2 = {a, b};
+ v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+ v2du vres = (v2du) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
@@ -215,17 +215,21 @@ fwprop_invalidate_lattice (tree name)
lattice[SSA_NAME_VERSION (name)] = NULL_TREE;
}
-
/* Get the statement we can propagate from into NAME skipping
trivial copies. Returns the statement which defines the
propagation source or NULL_TREE if there is no such one.
If SINGLE_USE_ONLY is set considers only sources which have
a single use chain up to NAME. If SINGLE_USE_P is non-null,
it is set to whether the chain to NAME is a single use chain
- or not. SINGLE_USE_P is not written to if SINGLE_USE_ONLY is set. */
+   or not.  SINGLE_USE_P is not written to if SINGLE_USE_ONLY is set.
+   If VIEW_CONV_PROP is non-null, VIEW_CONVERT_EXPR statements are also
+   treated as trivial copies, which can expose more propagation
+   opportunities.  If the chain to NAME contains one or more
+   VIEW_CONVERT_EXPR statements, *VIEW_CONV_PROP is set to true.
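+
+   For example (a sketch only, the SSA names are made up):
+
+     v1_1 = {a_2(D), 0};
+     q_3 = VIEW_CONVERT_EXPR<v16qu>(v1_1);
+
+   Looking up the propagation source of q_3 with a non-null
+   VIEW_CONV_PROP skips the VIEW_CONVERT_EXPR, returns the statement
+   defining v1_1 and sets *VIEW_CONV_PROP to true.  */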
static gimple *
-get_prop_source_stmt (tree name, bool single_use_only, bool *single_use_p)
+get_prop_source_stmt (tree name, bool single_use_only, bool *single_use_p,
+ bool *view_conv_prop = NULL)
{
bool single_use = true;
@@ -246,6 +250,15 @@ get_prop_source_stmt (tree name, bool single_use_only, bool *single_use_p)
/* If def_stmt is a simple copy, continue looking. */
if (gimple_assign_rhs_code (def_stmt) == SSA_NAME)
name = gimple_assign_rhs1 (def_stmt);
+ else if (view_conv_prop
+ && gimple_assign_rhs_code (def_stmt) == VIEW_CONVERT_EXPR)
+ {
+ tree view = gimple_assign_rhs1 (def_stmt);
+ name = TREE_OPERAND (view, 0);
+ if (TREE_CODE (name) != SSA_NAME)
+ return NULL;
+ *view_conv_prop = true;
+ }
else
{
if (!single_use_only && single_use_p)
@@ -301,11 +314,12 @@ can_propagate_from (gimple *def_stmt)
NAME. The chain is linked via the first operand of the defining statements.
If NAME was replaced in its only use then this function can be used
to clean up dead stmts. The function handles already released SSA
- names gracefully.
+   names gracefully.  If VIEW_CONV_PROP is true, VIEW_CONVERT_EXPR
+   statements are also treated as trivial copies.
Returns true if cleanup-cfg has to run. */
static bool
-remove_prop_source_from_use (tree name)
+remove_prop_source_from_use (tree name, bool view_conv_prop = false)
{
gimple_stmt_iterator gsi;
gimple *stmt;
@@ -332,7 +346,11 @@ remove_prop_source_from_use (tree name)
fwprop_invalidate_lattice (gimple_get_lhs (stmt));
release_defs (stmt);
- name = is_gimple_assign (stmt) ? gimple_assign_rhs1 (stmt) : NULL_TREE;
+ if (!is_gimple_assign (stmt))
+ break;
+ name = gimple_assign_rhs1 (stmt);
+ if (view_conv_prop && TREE_CODE (name) == VIEW_CONVERT_EXPR)
+ name = TREE_OPERAND (name, 0);
} while (name && TREE_CODE (name) == SSA_NAME);
return cfg_changed;
@@ -2115,7 +2133,7 @@ is_combined_permutation_identity (tree mask1, tree mask2)
/* Combine a shuffle with its arguments. Returns 1 if there were any
changes made, 2 if cfg-cleanup needs to run. Else it returns 0. */
-
+
static int
simplify_permutation (gimple_stmt_iterator *gsi)
{
@@ -2124,6 +2142,7 @@ simplify_permutation (gimple_stmt_iterator *gsi)
tree op0, op1, op2, op3, arg0, arg1;
enum tree_code code;
bool single_use_op0 = false;
+ bool view_conv_prop_op0 = false;
gcc_checking_assert (gimple_assign_rhs_code (stmt) == VEC_PERM_EXPR);
@@ -2141,7 +2160,8 @@ simplify_permutation (gimple_stmt_iterator *gsi)
}
else if (TREE_CODE (op0) == SSA_NAME)
{
- def_stmt = get_prop_source_stmt (op0, false, &single_use_op0);
+ def_stmt = get_prop_source_stmt (op0, false, &single_use_op0,
+ &view_conv_prop_op0);
if (!def_stmt || !can_propagate_from (def_stmt))
return 0;
@@ -2151,6 +2171,11 @@ simplify_permutation (gimple_stmt_iterator *gsi)
else
return 0;
+  /* If VIEW_CONVERT_EXPR propagation happened, for now we only handle
+     CONSTRUCTOR + CONSTRUCTOR or CONSTRUCTOR + VECTOR_CST inputs.  */
+ if (view_conv_prop_op0 && (code != CONSTRUCTOR && code != VECTOR_CST))
+ return 0;
+
/* Two consecutive shuffles. */
if (code == VEC_PERM_EXPR)
{
@@ -2179,7 +2204,10 @@ simplify_permutation (gimple_stmt_iterator *gsi)
{
tree opt;
bool ret = false;
- if (op0 != op1)
+ bool view_conv_prop_op1 = false;
+ enum tree_code code2 = ERROR_MARK;
+ bool same_op = (op0 == op1);
+ if (!same_op)
{
if (TREE_CODE (op0) == SSA_NAME && !single_use_op0)
return 0;
@@ -2188,9 +2216,8 @@ simplify_permutation (gimple_stmt_iterator *gsi)
arg1 = op1;
else if (TREE_CODE (op1) == SSA_NAME)
{
- enum tree_code code2;
-
- gimple *def_stmt2 = get_prop_source_stmt (op1, true, NULL);
+ gimple *def_stmt2
+ = get_prop_source_stmt (op1, true, NULL, &view_conv_prop_op1);
if (!def_stmt2 || !can_propagate_from (def_stmt2))
return 0;
@@ -2209,16 +2236,94 @@ simplify_permutation (gimple_stmt_iterator *gsi)
return 0;
arg1 = arg0;
}
- opt = fold_ternary (VEC_PERM_EXPR, TREE_TYPE (op0), arg0, arg1, op2);
+
+      /* If any VIEW_CONVERT_EXPRs were found while looking up the sources
+	 of the permutation operands, check whether the transformation is
+	 valid and prepare the required new operands.  */
+ if (view_conv_prop_op0 || view_conv_prop_op1)
+ {
+ if (TREE_CODE (op2) != VECTOR_CST)
+ return 0;
+	  /* At least one of them should be a CONSTRUCTOR (not a VECTOR_CST),
+	     otherwise it would have been folded already.  */
+ gcc_assert (code == CONSTRUCTOR
+ || (!same_op && code2 == CONSTRUCTOR));
+	  /* Figure out the target vector type to which the operands should
+	     be converted.  If both are CONSTRUCTORs, their types should be
+	     the same; otherwise use the type of the CONSTRUCTOR.  */
+ tree tgt_type = NULL_TREE;
+ if (code == CONSTRUCTOR)
+ tgt_type = TREE_TYPE (arg0);
+ if (code2 == CONSTRUCTOR)
+ {
+ tree arg1_type = TREE_TYPE (arg1);
+ if (tgt_type == NULL_TREE)
+ tgt_type = arg1_type;
+ else if (tgt_type != arg1_type)
+ return 0;
+ }
+
+ if (!VECTOR_TYPE_P (tgt_type))
+ return 0;
+ tree op2_type = TREE_TYPE (op2);
+ /* Should have folded this before. */
+ gcc_assert (op2_type != tgt_type);
+
+	  /* Figure out the shrink factor.  */
+ poly_uint64 tgt_units = TYPE_VECTOR_SUBPARTS (tgt_type);
+ poly_uint64 op2_units = TYPE_VECTOR_SUBPARTS (op2_type);
+ if (maybe_gt (tgt_units, op2_units))
+ return 0;
+ unsigned int factor;
+ if (!constant_multiple_p (op2_units, tgt_units, &factor))
+ return 0;
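+	  /* For example, with values like those in the accompanying tests:
+	     a v16qu permutation control over v2du CONSTRUCTOR operands gives
+	     op2_units = 16 and tgt_units = 2, so factor is 8 (one 64-bit
+	     element per 8 byte indexes).  */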
+
+	  /* Build the new permutation control vector in terms of the target
+	     vector type.
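+	     For the v2du example above, the byte control vector
+	     { 0, 1, ..., 7, 16, 17, ..., 23 } shrinks by a factor of 8
+	     to { 0, 2 }, i.e. element 0 of the first input followed by
+	     element 0 of the second.  */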
+ vec_perm_builder builder;
+ if (!tree_to_vec_perm_builder (&builder, op2))
+ return 0;
+ vec_perm_indices indices (builder, 2, op2_units);
+ vec_perm_indices new_indices;
+ if (new_indices.new_shrinked_vector (indices, factor))
+ {
+ tree mask_type = tgt_type;
+ if (!VECTOR_INTEGER_TYPE_P (mask_type))
+ {
+ tree elem_type = TREE_TYPE (mask_type);
+ unsigned elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+ tree int_type = build_nonstandard_integer_type (elem_size, 0);
+ mask_type = build_vector_type (int_type, tgt_units);
+ }
+ op2 = vec_perm_indices_to_tree (mask_type, new_indices);
+ }
+ else
+ return 0;
+
+ /* Convert the VECTOR_CST to the appropriate vector type. */
+ if (tgt_type != TREE_TYPE (arg0))
+ arg0 = fold_build1 (VIEW_CONVERT_EXPR, tgt_type, arg0);
+ else if (tgt_type != TREE_TYPE (arg1))
+ arg1 = fold_build1 (VIEW_CONVERT_EXPR, tgt_type, arg1);
+ }
+ tree res_type = TREE_TYPE (arg0);
+ opt = fold_ternary (VEC_PERM_EXPR, res_type, arg0, arg1, op2);
if (!opt
|| (TREE_CODE (opt) != CONSTRUCTOR && TREE_CODE (opt) != VECTOR_CST))
return 0;
+      /* Found a VIEW_CONVERT_EXPR before, so emit one explicit conversion.  */
+ if (res_type != TREE_TYPE (op0))
+ {
+ tree name = make_ssa_name (TREE_TYPE (opt));
+ gimple *ass_stmt = gimple_build_assign (name, opt);
+ gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+ opt = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (op0), name);
+ }
gimple_assign_set_rhs_from_tree (gsi, opt);
update_stmt (gsi_stmt (*gsi));
if (TREE_CODE (op0) == SSA_NAME)
- ret = remove_prop_source_from_use (op0);
+ ret = remove_prop_source_from_use (op0, true);
if (op0 != op1 && TREE_CODE (op1) == SSA_NAME)
- ret |= remove_prop_source_from_use (op1);
+ ret |= remove_prop_source_from_use (op1, true);
return ret ? 2 : 1;
}
@@ -101,6 +101,70 @@ vec_perm_indices::new_expanded_vector (const vec_perm_indices &orig,
m_encoding.finalize ();
}
+/* Check whether we can switch to a new permutation vector that
+   selects the same input elements as ORIG, but with each element
+   built up from FACTOR pieces.  Return true if yes, otherwise
+   return false.  The indexes are handled in batches of FACTOR:
+   within each batch they must be consecutive, and the first index
+   of each batch must be an exact multiple of FACTOR.  For example,
+   if ORIG is { 2, 3, 4, 5, 0, 1, 6, 7 } and FACTOR is 2, the new
+   permutation is { 1, 2, 0, 3 }.
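+   As a counterexample, { 1, 2, 3, 4 } with FACTOR 2 would be rejected,
+   since its first batch { 1, 2 } does not start at a multiple of 2.  */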
+
+bool
+vec_perm_indices::new_shrinked_vector (const vec_perm_indices &orig,
+ unsigned int factor)
+{
+ gcc_assert (factor > 0);
+
+ if (maybe_lt (orig.m_nelts_per_input, factor))
+ return false;
+
+ poly_uint64 nelts;
+  /* Invalid if the number of vector units isn't a multiple of FACTOR.  */
+ if (!multiple_p (orig.m_nelts_per_input, factor, &nelts))
+ return false;
+
+  /* Only handle the case where npatterns is a multiple of FACTOR.
+     FIXME: See whether we can reshape the encoding when it is not.  */
+ if (orig.m_encoding.npatterns () % factor != 0)
+ return false;
+
+ unsigned int encoded_nelts = orig.m_encoding.encoded_nelts ();
+ auto_vec<element_type> encodings (encoded_nelts);
+  /* Split the encoded elements into batches of size FACTOR, then ensure
+     that the first element of each batch is a multiple of FACTOR and
+     that the remaining elements of each batch are consecutive from
+     the first one.  */
+ for (unsigned int i = 0; i < encoded_nelts; i += factor)
+ {
+ element_type first = orig.m_encoding[i];
+ element_type new_index;
+ if (!multiple_p (first, factor, &new_index))
+ return false;
+ for (unsigned int j = 1; j < factor; ++j)
+ {
+ if (maybe_ne (first + j, orig.m_encoding[i + j]))
+ return false;
+ }
+ encodings.quick_push (new_index);
+ }
+
+ m_ninputs = orig.m_ninputs;
+ m_nelts_per_input = nelts;
+ poly_uint64 full_nelts = exact_div (orig.m_encoding.full_nelts (), factor);
+ unsigned int npatterns = orig.m_encoding.npatterns () / factor;
+
+ m_encoding.new_vector (full_nelts, npatterns,
+ orig.m_encoding.nelts_per_pattern ());
+
+ for (unsigned int i = 0; i < encodings.length (); i++)
+ m_encoding.quick_push (encodings[i]);
+
+ m_encoding.finalize ();
+
+ return true;
+}
+
/* Rotate the inputs of the permutation right by DELTA inputs. This changes
the values of the permutation vector but it doesn't change the way that
the elements are encoded. */
@@ -57,6 +57,7 @@ public:
void new_vector (const vec_perm_builder &, unsigned int, poly_uint64);
void new_expanded_vector (const vec_perm_indices &, unsigned int);
+ bool new_shrinked_vector (const vec_perm_indices &, unsigned int);
void rotate_inputs (int delta);
/* Return the underlying vector encoding. */