new file mode 100644
@@ -0,0 +1,124 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-options "-O2 -mvsx" } */
+
+#include "vec-perm-ctor.h"
+
+#include <stdlib.h>
+
+int
+main ()
+{
+ du a_du = 100ULL;
+ du b_du = 200ULL;
+
+ di a_di = -100;
+ di b_di = 200;
+
+ df a_df = 10.0;
+ df b_df = 20.0;
+
+ si a_si = 12;
+ si b_si = -25;
+ si c_si = -37;
+ si d_si = 50;
+
+ sf a_sf = 30.0f;
+ sf b_sf = 40.0f;
+ sf c_sf = 50.0f;
+ sf d_sf = 60.0f;
+
+ hu a_hu = 10;
+ hu b_hu = 20;
+ hu c_hu = 30;
+ hu d_hu = 40;
+ hu e_hu = 50;
+ hu f_hu = 60;
+ hu g_hu = 70;
+ hu h_hu = 80;
+
+ qi a_qi = 10;
+ qi b_qi = 20;
+ qi c_qi = -30;
+ qi d_qi = 40;
+ qi e_qi = -50;
+ qi f_qi = 60;
+ qi g_qi = 70;
+ qi h_qi = -80;
+
+ v2du res1 = test_ctor_ctor_same_du (a_du, b_du);
+ if (res1[0] != a_du || res1[1] != b_du)
+ abort ();
+
+ v2df res2 = test_ctor_ctor_same_df (a_df, b_df);
+ if (res2[0] != a_df || res2[1] != b_df)
+ abort ();
+
+ v4si res3 = test_ctor_ctor_same_si (a_si, b_si, c_si, d_si);
+ if (res3[0] != a_si || res3[1] != b_si || res3[2] != c_si || res3[3] != d_si)
+ abort ();
+
+ v4sf res4 = test_ctor_ctor_same_sf (a_sf, b_sf, c_sf, d_sf);
+ if (res4[0] != a_sf || res4[1] != b_sf || res4[2] != c_sf || res4[3] != d_sf)
+ abort ();
+
+ v8hu res5
+ = test_ctor_ctor_same_hu (a_hu, b_hu, c_hu, d_hu, e_hu, f_hu, g_hu, h_hu);
+
+ if (res5[0] != a_hu || res5[1] != b_hu || res5[2] != c_hu || res5[3] != d_hu
+ || res5[4] != e_hu || res5[5] != f_hu || res5[6] != g_hu
+ || res5[7] != h_hu)
+ abort ();
+
+ v16qi res6
+ = test_ctor_ctor_same_qi (a_qi, b_qi, c_qi, d_qi, e_qi, f_qi, g_qi, h_qi);
+
+ if (res6[0] != a_qi || res6[1] != b_qi || res6[2] != c_qi || res6[3] != d_qi
+ || res6[4] != a_qi || res6[5] != b_qi || res6[6] != c_qi
+ || res6[7] != d_qi || res6[8] != e_qi || res6[9] != f_qi
+ || res6[10] != g_qi || res6[11] != h_qi || res6[12] != e_qi
+ || res6[13] != f_qi || res6[14] != g_qi || res6[15] != h_qi)
+ abort ();
+
+ v2du res7 = test_ctor_cst_same_du (a_du, b_du);
+ if (res7[0] != a_du || res7[1] != 100)
+ abort ();
+
+ v4sf res8 = test_ctor_cst_same_sf (a_sf, b_sf);
+ if (res8[0] != a_sf || res8[1] != 2.0f || res8[2] != b_sf || res8[3] != 4.0f)
+ abort ();
+
+ v2df res9 = test_ctor_cst_same_df (a_df, b_df);
+ if (res9[0] != b_df || res9[1] != 200.0)
+ abort ();
+
+ v4si res10 = test_cst_ctor_same_si (a_si, b_si);
+ if (res10[0] != 1 || res10[1] != 3 || res10[2] != a_si || res10[3] != b_si)
+ abort ();
+
+ v2di res11 = test_ctor_cst_diff_di_si (a_di, b_di);
+ /* Take endianness into account: since the function converts a vector
+ constant to a vector type with a different element size, the endianness
+ determines the reinterpreted layout. Same reason for res12 below. */
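+ /* E.g., the first doubleword of the v4si constant { 3, 0, 4, 0 } reads
+ back as 3 on little-endian but as 0x300000000 on big-endian. */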
+ if (res11[0] != -100 ||
+#ifdef __LITTLE_ENDIAN__
+ res11[1] != 3
+#else
+ res11[1] != 0x300000000LL
+#endif
+ )
+ abort ();
+
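+ /* res12[0] combines 1.0f (bit pattern 0x3f800000) and 2.0f (0x40000000)
+ into one 64-bit unit, so its word order also depends on endianness. */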
+ v2du res12 = test_cst_ctor_diff_sf_du (a_du, b_du);
+ if (
+#ifdef __LITTLE_ENDIAN__
+ res12[0] != 0x400000003f800000ULL
+#else
+ res12[0] != 0x3f80000040000000ULL
+#endif
+ || res12[1] != 100)
+ abort ();
+
+ return 0;
+}
+
new file mode 100644
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx -fdump-tree-optimized" } */
+
+/* Test that all permutations fed by CTORs and CSTs can be optimized away. */
+
+#include "vec-perm-ctor.h"
+
+/* { dg-final { scan-tree-dump-not "VIEW_CONVERT_EXPR" "optimized" } } */
new file mode 100644
@@ -0,0 +1,163 @@
+#include "altivec.h"
+
+typedef vector unsigned long long v2du;
+typedef vector signed long long v2di;
+typedef vector unsigned int v4su;
+typedef vector signed int v4si;
+typedef vector unsigned short v8hu;
+typedef vector signed short v8hi;
+typedef vector unsigned char v16qu;
+typedef vector signed char v16qi;
+typedef vector double v2df;
+typedef vector float v4sf;
+
+typedef unsigned long long du;
+typedef signed long long di;
+typedef unsigned int su;
+typedef signed int si;
+typedef unsigned short hu;
+typedef signed short hi;
+typedef unsigned char qu;
+typedef signed char qi;
+typedef double df;
+typedef float sf;
+
+/* Test whether we can optimize a vector permutation away when its
+ two inputs are CTORs of the same type, or when one input is a CTOR
+ and the other is a CST. */
+
+/* CTOR + CTOR part (only same type supported). */
+
+/* Test where both operands are CTORs of the same type (unsigned long long). */
+__attribute__ ((noipa)) v2du
+test_ctor_ctor_same_du (du a, du b)
+{
+ v2du v1 = {a, 0};
+ v2du v2 = {b, 0};
+ v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
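+ /* The control vector selects bytes 0-7 of the concatenation (v1[0] == a)
+ and bytes 16-23 (v2[0] == b), so vres should be { a, b }. */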
+ v2du vres = (v2du) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test where both operands are CTORs of the same type (double). */
+__attribute__ ((noipa)) v2df
+test_ctor_ctor_same_df (df a, df b)
+{
+ v2df v1 = {0.0, a};
+ v2df v2 = {0.0, b};
+ v16qu vc = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+ v2df vres = (v2df) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test where both operands are CTORs of the same type (signed int). */
+__attribute__ ((noipa)) v4si
+test_ctor_ctor_same_si (si a, si b, si c, si d)
+{
+ v4si v1 = {0, a, 0, c};
+ v4si v2 = {0, b, 0, d};
+ v16qu vc = {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
+ v4si vres = (v4si) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test where both operands are CTORs of the same type (float). */
+__attribute__ ((noipa)) v4sf
+test_ctor_ctor_same_sf (sf a, sf b, sf c, sf d)
+{
+ v4sf v1 = {c, 0.0f, d, 0.0f};
+ v4sf v2 = {a, 0.0f, b, 0.0f};
+ v16qu vc = {16, 17, 18, 19, 24, 25, 26, 27, 0, 1, 2, 3, 8, 9, 10, 11};
+ v4sf vres = (v4sf) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test where both operands are CTORs of the same type (unsigned short). */
+__attribute__ ((noipa)) v8hu
+test_ctor_ctor_same_hu (hu a, hu b, hu c, hu d, hu e, hu f, hu g, hu h)
+{
+ v8hu v1 = {0, a, 0, b, 0, c, 0, d};
+ v8hu v2 = {0, e, 0, f, 0, g, 0, h};
+ v16qu vc = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
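+ /* Byte pairs {2,3}, {6,7}, ... select the odd halfwords of each input,
+ i.e. exactly the non-zero elements. */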
+ v8hu vres = (v8hu) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* Test where both operands are CTORs of the same type (signed char). */
+__attribute__ ((noipa)) v16qi
+test_ctor_ctor_same_qi (qi a, qi b, qi c, qi d, qi e, qi f, qi g, qi h)
+{
+ v16qi v1 = {0, a, 0, b, 0, c, 0, d, 0, a, 0, b, 0, c, 0, d};
+ v16qi v2 = {0, e, 0, f, 0, g, 0, h, 0, e, 0, f, 0, g, 0, h};
+ v16qu vc = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
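+ /* The odd byte indices select every non-zero element of v1 and then
+ of v2, dropping all the zero padding bytes. */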
+ v16qi vres = (v16qi) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* CTOR + CST part (same type). */
+
+__attribute__ ((noipa)) v2du
+test_ctor_cst_same_du (du a, du b)
+{
+ v2du v1 = {a, b};
+ v2du v2 = {100, 200};
+ v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+ v2du vres = (v2du) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+__attribute__ ((noipa)) v4sf
+test_ctor_cst_same_sf (sf a, sf b)
+{
+ v4sf v1 = {0.0f, a, 0.0f, b};
+ v4sf v2 = {1.0f, 2.0f, 3.0f, 4.0f};
+ v16qu vc = {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
+ v4sf vres = (v4sf) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* CST + CTOR part (same type). */
+
+__attribute__ ((noipa)) v2df
+test_ctor_cst_same_df (df a, df b)
+{
+ v2df v1 = {a, b};
+ v2df v2 = {100.0, 200.0};
+ v16qu vc = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+ v2df vres = (v2df) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+__attribute__ ((noipa)) v4si
+test_cst_ctor_same_si (si a, si b)
+{
+ v4si v1 = {a, 0, b, 0};
+ v4si v2 = {1, 2, 3, 4};
+ v16qu vc = {16, 17, 18, 19, 24, 25, 26, 27, 0, 1, 2, 3, 8, 9, 10, 11};
+ v4si vres = (v4si) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* CTOR + CST part (different types). */
+
+__attribute__ ((noipa)) v2di
+test_ctor_cst_diff_di_si (di a, di b)
+{
+ v2di v1 = {a, b};
+ v4si v2 = {3, 0, 4, 0};
+ v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
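+ /* Selects v1[0] (i.e. a) and the first 8 bytes of the v4si constant;
+ how those bytes read back as a DI depends on endianness, which the
+ caller accounts for. */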
+ v2di vres = (v2di) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
+
+/* CST + CTOR part (different types). */
+
+__attribute__ ((noipa)) v2du
+test_cst_ctor_diff_sf_du (du a, du b)
+{
+ v4sf v1 = {1.0f, 2.0f, 3.0f, 4.0f};
+ v2du v2 = {a, b};
+ v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+ v2du vres = (v2du) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+ return vres;
+}
@@ -2120,9 +2120,9 @@ static int
simplify_permutation (gimple_stmt_iterator *gsi)
{
gimple *stmt = gsi_stmt (*gsi);
- gimple *def_stmt;
+ gimple *def_stmt = NULL;
tree op0, op1, op2, op3, arg0, arg1;
- enum tree_code code;
+ enum tree_code code, code2 = ERROR_MARK;
bool single_use_op0 = false;
gcc_checking_assert (gimple_assign_rhs_code (stmt) == VEC_PERM_EXPR);
@@ -2142,10 +2142,28 @@ simplify_permutation (gimple_stmt_iterator *gsi)
else if (TREE_CODE (op0) == SSA_NAME)
{
def_stmt = get_prop_source_stmt (op0, false, &single_use_op0);
- if (!def_stmt || !can_propagate_from (def_stmt))
+ if (!def_stmt)
return 0;
-
code = gimple_assign_rhs_code (def_stmt);
+ if (code == VIEW_CONVERT_EXPR)
+ {
+ tree rhs = gimple_assign_rhs1 (def_stmt);
+ tree name = TREE_OPERAND (rhs, 0);
+ if (TREE_CODE (name) != SSA_NAME)
+ return 0;
+ if (!has_single_use (name))
+ single_use_op0 = false;
+ /* Here we update DEF_STMT to look through this VIEW_CONVERT_EXPR,
+ but keep CODE as VIEW_CONVERT_EXPR to record that the operand
+ came through one. */
+ def_stmt = SSA_NAME_DEF_STMT (name);
+ if (!def_stmt || !is_gimple_assign (def_stmt))
+ return 0;
+ if (gimple_assign_rhs_code (def_stmt) != CONSTRUCTOR)
+ return 0;
+ }
+ if (!can_propagate_from (def_stmt))
+ return 0;
arg0 = gimple_assign_rhs1 (def_stmt);
}
else
@@ -2173,12 +2191,10 @@ simplify_permutation (gimple_stmt_iterator *gsi)
update_stmt (stmt);
return remove_prop_source_from_use (op0) ? 2 : 1;
}
-
- /* Shuffle of a constructor. */
- else if (code == CONSTRUCTOR || code == VECTOR_CST)
+ else if (code == CONSTRUCTOR
+ || code == VECTOR_CST
+ || code == VIEW_CONVERT_EXPR)
{
- tree opt;
- bool ret = false;
if (op0 != op1)
{
if (TREE_CODE (op0) == SSA_NAME && !single_use_op0)
@@ -2188,14 +2204,27 @@ simplify_permutation (gimple_stmt_iterator *gsi)
arg1 = op1;
else if (TREE_CODE (op1) == SSA_NAME)
{
- enum tree_code code2;
-
gimple *def_stmt2 = get_prop_source_stmt (op1, true, NULL);
- if (!def_stmt2 || !can_propagate_from (def_stmt2))
+ if (!def_stmt2)
return 0;
-
code2 = gimple_assign_rhs_code (def_stmt2);
- if (code2 != CONSTRUCTOR && code2 != VECTOR_CST)
+ if (code2 == VIEW_CONVERT_EXPR)
+ {
+ tree rhs = gimple_assign_rhs1 (def_stmt2);
+ tree name = TREE_OPERAND (rhs, 0);
+ if (TREE_CODE (name) != SSA_NAME)
+ return 0;
+ if (!has_single_use (name))
+ return 0;
+ def_stmt2 = SSA_NAME_DEF_STMT (name);
+ if (!def_stmt2 || !is_gimple_assign (def_stmt2))
+ return 0;
+ if (gimple_assign_rhs_code (def_stmt2) != CONSTRUCTOR)
+ return 0;
+ }
+ else if (code2 != CONSTRUCTOR && code2 != VECTOR_CST)
+ return 0;
+ if (!can_propagate_from (def_stmt2))
return 0;
arg1 = gimple_assign_rhs1 (def_stmt2);
}
@@ -2209,10 +2238,92 @@ simplify_permutation (gimple_stmt_iterator *gsi)
return 0;
arg1 = arg0;
}
- opt = fold_ternary (VEC_PERM_EXPR, TREE_TYPE (op0), arg0, arg1, op2);
+
+ /* If a VIEW_CONVERT_EXPR was found while tracing the source of either
+ permutation operand, check whether the transform is valid and
+ prepare the required new operands. */
+ if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR)
+ {
+ /* Figure out the target vector type to which the operands should be
+ converted. If both are CONSTRUCTORs, their types should be the
+ same; otherwise, take the type of the CONSTRUCTOR. */
+ tree tgt_type = NULL_TREE;
+ if (code == VIEW_CONVERT_EXPR)
+ {
+ gcc_assert (gimple_assign_rhs_code (def_stmt) == CONSTRUCTOR);
+ code = CONSTRUCTOR;
+ tgt_type = TREE_TYPE (arg0);
+ }
+ if (code2 == VIEW_CONVERT_EXPR)
+ {
+ tree arg1_type = TREE_TYPE (arg1);
+ if (tgt_type == NULL_TREE)
+ tgt_type = arg1_type;
+ else if (tgt_type != arg1_type)
+ return 0;
+ }
+
+ if (!VECTOR_TYPE_P (tgt_type))
+ return 0;
+ tree op2_type = TREE_TYPE (op2);
+ /* Should have folded this before. */
+ gcc_assert (op2_type != tgt_type);
+
+ /* Figure out the shrink factor, e.g. 8 when a 16-unit QI selector
+ feeds a 2-unit DI vector. */
+ poly_uint64 tgt_units = TYPE_VECTOR_SUBPARTS (tgt_type);
+ poly_uint64 op2_units = TYPE_VECTOR_SUBPARTS (op2_type);
+ if (maybe_gt (tgt_units, op2_units))
+ return 0;
+ unsigned int factor;
+ if (!constant_multiple_p (op2_units, tgt_units, &factor))
+ return 0;
+
+ /* Rebuild the permutation control vector in terms of the target
+ vector type. */
+ vec_perm_builder builder;
+ if (!tree_to_vec_perm_builder (&builder, op2))
+ return 0;
+ vec_perm_indices indices (builder, 2, op2_units);
+ vec_perm_indices new_indices;
+ if (new_indices.new_shrunk_vector (indices, factor))
+ {
+ tree mask_type = tgt_type;
+ if (!VECTOR_INTEGER_TYPE_P (mask_type))
+ {
+ tree elem_type = TREE_TYPE (mask_type);
+ unsigned elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+ tree int_type = build_nonstandard_integer_type (elem_size, 0);
+ mask_type = build_vector_type (int_type, tgt_units);
+ }
+ op2 = vec_perm_indices_to_tree (mask_type, new_indices);
+ }
+ else
+ return 0;
+
+ /* Convert the VECTOR_CST operand to the target vector type. */
+ if (tgt_type != TREE_TYPE (arg0))
+ arg0 = fold_build1 (VIEW_CONVERT_EXPR, tgt_type, arg0);
+ else if (tgt_type != TREE_TYPE (arg1))
+ arg1 = fold_build1 (VIEW_CONVERT_EXPR, tgt_type, arg1);
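+ /* Both operands now have the target vector type and the shrunk
+ op2 selects its elements directly. */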
+ }
+
+ /* Any VIEW_CONVERT_EXPR code should have been updated to CONSTRUCTOR above. */
+ gcc_assert (code == CONSTRUCTOR || code == VECTOR_CST);
+
+ /* Shuffle of a constructor. */
+ bool ret = false;
+ tree res_type = TREE_TYPE (arg0);
+ tree opt = fold_ternary (VEC_PERM_EXPR, res_type, arg0, arg1, op2);
if (!opt
|| (TREE_CODE (opt) != CONSTRUCTOR && TREE_CODE (opt) != VECTOR_CST))
return 0;
+ /* A VIEW_CONVERT_EXPR was found above, so convert the folded result
+ back to the type of OP0 explicitly. */
+ if (res_type != TREE_TYPE (op0))
+ {
+ tree name = make_ssa_name (TREE_TYPE (opt));
+ gimple *ass_stmt = gimple_build_assign (name, opt);
+ gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+ opt = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (op0), name);
+ }
gimple_assign_set_rhs_from_tree (gsi, opt);
update_stmt (gsi_stmt (*gsi));
if (TREE_CODE (op0) == SSA_NAME)
@@ -101,6 +101,70 @@ vec_perm_indices::new_expanded_vector (const vec_perm_indices &orig,
m_encoding.finalize ();
}
+/* Check whether we can switch to a new permutation vector that
+ selects the same input elements as ORIG, but with each element
+ built up from FACTOR pieces. Return true if yes, otherwise
+ return false. Each batch of FACTOR consecutive indices must
+ select consecutive values, and the first index of each batch
+ must be an exact multiple of FACTOR. For example, if ORIG is
+ { 2, 3, 4, 5, 0, 1, 6, 7 } and FACTOR is 2, the new permutation
+ is { 1, 2, 0, 3 }. */
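+/* In the example above, the batches (2,3), (4,5), (0,1) and (6,7) are
+ each consecutive and start at a multiple of 2, so dividing the first
+ index of each batch by 2 yields the new indices { 1, 2, 0, 3 }. */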
+
+bool
+vec_perm_indices::new_shrunk_vector (const vec_perm_indices &orig,
+ unsigned int factor)
+{
+ gcc_assert (factor > 0);
+
+ if (maybe_lt (orig.m_nelts_per_input, factor))
+ return false;
+
+ poly_uint64 nelts;
+ /* Invalid if the number of vector units isn't a multiple of FACTOR. */
+ if (!multiple_p (orig.m_nelts_per_input, factor, &nelts))
+ return false;
+
+ /* Only handle the case where npatterns is a multiple of FACTOR.
+ FIXME: try to see whether the encoding can be reshaped so that
+ npatterns becomes a multiple of FACTOR. */
+ if (orig.m_encoding.npatterns () % factor != 0)
+ return false;
+
+ unsigned int encoded_nelts = orig.m_encoding.encoded_nelts ();
+ auto_vec<element_type> encodings (encoded_nelts);
+ /* Separate all encoded elements into batches by size factor,
+ then ensure the first element of each batch is multiple of
+ factor and all elements in each batch is consecutive from
+ the first one. */
+ for (unsigned int i = 0; i < encoded_nelts; i += factor)
+ {
+ element_type first = orig.m_encoding[i];
+ element_type new_index;
+ if (!multiple_p (first, factor, &new_index))
+ return false;
+ for (unsigned int j = 1; j < factor; ++j)
+ {
+ if (maybe_ne (first + j, orig.m_encoding[i + j]))
+ return false;
+ }
+ encodings.quick_push (new_index);
+ }
+
+ m_ninputs = orig.m_ninputs;
+ m_nelts_per_input = nelts;
+ poly_uint64 full_nelts = exact_div (orig.m_encoding.full_nelts (), factor);
+ unsigned int npatterns = orig.m_encoding.npatterns () / factor;
+
+ m_encoding.new_vector (full_nelts, npatterns,
+ orig.m_encoding.nelts_per_pattern ());
+
+ for (unsigned int i = 0; i < encodings.length (); i++)
+ m_encoding.quick_push (encodings[i]);
+
+ m_encoding.finalize ();
+
+ return true;
+}
+
/* Rotate the inputs of the permutation right by DELTA inputs. This changes
the values of the permutation vector but it doesn't change the way that
the elements are encoded. */
@@ -57,6 +57,7 @@ public:
void new_vector (const vec_perm_builder &, unsigned int, poly_uint64);
void new_expanded_vector (const vec_perm_indices &, unsigned int);
+ bool new_shrunk_vector (const vec_perm_indices &, unsigned int);
void rotate_inputs (int delta);
/* Return the underlying vector encoding. */