@@ -842,13 +842,45 @@ public:
for (unsigned int i = 0; i < nargs; ++i)
{
tree elt = gimple_call_arg (f.call, i);
- if (!CONSTANT_CLASS_P (elt))
- return NULL;
builder.quick_push (elt);
for (unsigned int j = 1; j < factor; ++j)
builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
}
- return gimple_build_assign (f.lhs, builder.build ());
+ builder.finalize ();
+ unsigned int n_elts
+ = builder.nelts_per_pattern () == 1 ? builder.npatterns ()
+ : builder.full_nelts ().coeffs[0];
+
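+ /* A single distinct element reduces to a plain vector duplicate.  */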
+ if (n_elts == 1)
+ return gimple_build_assign (f.lhs, build1 (VEC_DUPLICATE_EXPR, vec_type,
+ builder.elt (0)));
+ tree list = NULL_TREE;
+ tree *pp = &list;
+ for (unsigned int i = 0; i < n_elts; ++i)
+ {
+ *pp = build_tree_list (NULL_TREE, builder.elt (i));
+ pp = &TREE_CHAIN (*pp);
+ }
+
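+ /* Build the repeating selector {0, 1, ..., n_elts - 1} so that the
+ VEC_PERM_EXPR below replicates the n_elts constructed lanes across
+ the whole (possibly variable-length) vector.  */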
+ poly_uint64 vec_len = TYPE_VECTOR_SUBPARTS (vec_type);
+ vec_perm_builder sel (vec_len, n_elts, 1);
+ for (unsigned int i = 0; i < n_elts; i++)
+ sel.quick_push (i);
+ vec_perm_indices indices (sel, 1, n_elts);
+
+ tree elt_type = TREE_TYPE (vec_type);
+
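+ /* Wrap the distinct elements in a fixed-length CONSTRUCTOR with its
+ own SSA name; the permute then broadcasts it into the full vector.  */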
+ tree ctor_type = build_vector_type (elt_type, n_elts);
+ tree ctor = make_ssa_name_fn (cfun, ctor_type, 0);
+ gimple *ctor_stmt
+ = gimple_build_assign (ctor,
+ build_constructor_from_list (ctor_type, list));
+ gsi_insert_before (f.gsi, ctor_stmt, GSI_SAME_STMT);
+
+ tree mask_type = build_vector_type (ssizetype, vec_len);
+ tree mask = vec_perm_indices_to_tree (mask_type, indices);
+ return gimple_build_assign (f.lhs, fold_build3 (VEC_PERM_EXPR, vec_type,
+ ctor, ctor, mask));
}
rtx
@@ -2544,6 +2544,17 @@
}
)
+;; Duplicate a scalar register (which may hold an Advanced SIMD
+;; subvector) to fill an SVE vector (LE version).
+(define_insn "*aarch64_vec_duplicate_reg<mode>_le"
+ [(set (match_operand:SVE_FULL 0 "register_operand" "=w,w")
+ (vec_duplicate:SVE_FULL
+ (match_operand:<VEL> 1 "register_operand" "w,r")))]
+ "TARGET_SVE && !BYTES_BIG_ENDIAN"
+ "@
+ mov\t%0.<Vetype>, %<Vetype>1
+ mov\t%0.<Vetype>, %<vwcore>1"
+)
+
;; Duplicate an Advanced SIMD vector to fill an SVE vector (BE version).
;; The SVE register layout puts memory lane N into (architectural)
;; register lane N, whereas the Advanced SIMD layout puts the memory
@@ -6033,7 +6033,6 @@ rtx
aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
{
machine_mode src_mode = GET_MODE (src);
- gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
insn_code icode = (BYTES_BIG_ENDIAN
? code_for_aarch64_vec_duplicate_vq_be (mode)
: code_for_aarch64_vec_duplicate_vq_le (mode));
@@ -21806,20 +21805,29 @@ aarch64_simd_make_constant (rtx vals)
}
static void
-aarch64_vec_duplicate (rtx target, machine_mode mode, machine_mode element_mode,
-		       int narrow_n_elts)
+aarch64_vec_duplicate (rtx target, rtx op, machine_mode mode,
+		       machine_mode element_mode, int narrow_n_elts)
{
poly_uint64 size = narrow_n_elts * GET_MODE_BITSIZE (element_mode);
- scalar_mode i_mode = int_mode_for_size (size, 0).require ();
machine_mode o_mode;
- if (aarch64_sve_mode_p (mode))
- o_mode = aarch64_full_sve_mode (i_mode).require ();
+ rtx input, output;
+ bool sve = aarch64_sve_mode_p (mode);
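+ /* A 128-bit (Q-register sized) source can be duplicated straight into
+ an SVE vector; smaller sources are first reinterpreted as a single
+ integer of the same width.  */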
+ if (sve && known_eq (size, 128U))
+ {
+ o_mode = mode;
+ output = target;
+ input = op;
+ }
else
- o_mode
- = aarch64_simd_container_mode (i_mode,
- GET_MODE_BITSIZE (mode));
- rtx input = simplify_gen_subreg (i_mode, target, mode, 0);
- rtx output = simplify_gen_subreg (o_mode, target, mode, 0);
+ {
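+ /* View the leading NARROW_N_ELTS elements of OP as one integer and
+ duplicate that across the destination.  */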
+ scalar_mode i_mode = int_mode_for_size (size, 0).require ();
+ o_mode
+ = sve ? aarch64_full_sve_mode (i_mode).require ()
+ : aarch64_simd_container_mode (i_mode,
+ GET_MODE_BITSIZE (mode));
+ input = simplify_gen_subreg (i_mode, op, GET_MODE (op), 0);
+ output = simplify_gen_subreg (o_mode, target, mode, 0);
+ }
aarch64_emit_move (output, gen_vec_duplicate (o_mode, input));
}
@@ -21910,6 +21918,16 @@ aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
return;
}
+ /* We are constructing a VLS vector that we may later duplicate into a
+ VLA one.  ??? Consider splitting this into separate paths for Advanced
+ SIMD and SVE.  */
+ machine_mode real_mode = mode;
+ rtx real_target = target;
+ if (aarch64_sve_mode_p (real_mode))
+ {
+ mode = aarch64_vq_mode (GET_MODE_INNER (real_mode)).require ();
+ target = simplify_gen_subreg (mode, target, real_mode, 0);
+ }
+
enum insn_code icode = optab_handler (vec_set_optab, mode);
gcc_assert (icode != CODE_FOR_nothing);
@@ -22000,8 +22018,8 @@ aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
x = copy_to_mode_reg (inner_mode, x);
emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
}
- if (!known_eq (v.full_nelts (), n_elts))
- aarch64_vec_duplicate (target, mode, GET_MODE (v0), n_elts);
+ if (!known_eq (v.full_nelts (), n_elts))
+ aarch64_vec_duplicate (real_target, target, real_mode, GET_MODE (v0), n_elts);
return;
}
@@ -22048,7 +22066,7 @@ aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
}
if (!known_eq (v.full_nelts (), n_elts))
- aarch64_vec_duplicate (target, mode, inner_mode, n_elts);
+ aarch64_vec_duplicate (real_target, target, real_mode, inner_mode, n_elts);
}
/* Emit RTL corresponding to:
@@ -23947,11 +23965,7 @@ aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
if (BYTES_BIG_ENDIAN
|| !d->one_vector_p
|| d->vec_flags != VEC_SVE_DATA
- || d->op_vec_flags != VEC_ADVSIMD
- || d->perm.encoding ().nelts_per_pattern () != 1
- || !known_eq (d->perm.encoding ().npatterns (),
- GET_MODE_NUNITS (d->op_mode))
- || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
+ || d->perm.encoding ().nelts_per_pattern () != 1)
return false;
int npatterns = d->perm.encoding ().npatterns ();
@@ -23962,7 +23976,10 @@ aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
if (d->testing_p)
return true;
- aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
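+ /* Broadcast the leading npatterns lanes of the input across the
+ full SVE vector.  */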
+ machine_mode mode = GET_MODE (d->target);
+ machine_mode element_mode = GET_MODE_INNER (mode);
+ aarch64_vec_duplicate (d->target, d->op0, mode, element_mode,
+ d->perm.encoding ().npatterns ());
return true;
}
@@ -24194,6 +24211,15 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
return ret;
}
+/* Implement TARGET_VECTORIZE_VLA_CONSTRUCTOR. */
+
+static bool
+aarch64_vectorize_vla_constructor (rtx target, rtx_vector_builder &builder)
+{
+ aarch64_expand_vector_init (target, builder);
+ return true;
+}
+
/* Generate a byte permute mask for a register of mode MODE,
which has NUNITS units. */
@@ -27667,6 +27693,10 @@ aarch64_libgcc_floating_mode_supported_p
#define TARGET_VECTORIZE_VEC_PERM_CONST \
aarch64_vectorize_vec_perm_const
+#undef TARGET_VECTORIZE_VLA_CONSTRUCTOR
+#define TARGET_VECTORIZE_VLA_CONSTRUCTOR \
+ aarch64_vectorize_vla_constructor
+
#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
@@ -6112,6 +6112,11 @@ instruction pattern. There is no need for the hook to handle these two
implementation approaches itself.
@end deftypefn
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_VLA_CONSTRUCTOR (rtx @var{target}, rtx_vector_builder @var{&builder})
+This hook is used to expand a variable-length (VLA) vector constructor
+into @var{target}, using the elements collected in @var{builder}.
+Return true if the expansion succeeded.
+@end deftypefn
+
@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
This hook should return the decl of a function that implements the
vectorized variant of the function with the @code{combined_fn} code
@@ -4164,6 +4164,8 @@ address; but often a machine-dependent strategy can generate better code.
@hook TARGET_VECTORIZE_VEC_PERM_CONST
+@hook TARGET_VECTORIZE_VLA_CONSTRUCTOR
+
@hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
@hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
@@ -10264,6 +10264,44 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
case VEC_PERM_EXPR:
{
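+ /* If the permutation selector is constant and both inputs come from
+ CONSTRUCTORs, gather the selected elements directly and let the target
+ expand the resulting (possibly variable-length) vector.  */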
+ if (TREE_CODE (treeop2) == VECTOR_CST
+ && targetm.vectorize.vla_constructor)
+ {
+ tree ctor0, ctor1;
+ if (TREE_CODE (treeop0) == SSA_NAME
+ && is_gimple_assign (SSA_NAME_DEF_STMT (treeop0)))
+ ctor0 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop0));
+ else
+ ctor0 = treeop0;
+ if (TREE_CODE (treeop1) == SSA_NAME
+ && is_gimple_assign (SSA_NAME_DEF_STMT (treeop1)))
+ ctor1 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop1));
+ else
+ ctor1 = treeop1;
+
+ if (TREE_CODE (ctor0) == CONSTRUCTOR
+ && TREE_CODE (ctor1) == CONSTRUCTOR)
+ {
+ unsigned int nelts = vector_cst_encoded_nelts (treeop2);
+ unsigned int ctor_nelts = CONSTRUCTOR_NELTS (ctor0);
+ machine_mode mode = GET_MODE (target);
+ rtx_vector_builder builder (mode, nelts, 1);
+ for (unsigned int i = 0; i < nelts; ++i)
+ {
+ unsigned HOST_WIDE_INT index
+ = tree_to_uhwi (VECTOR_CST_ENCODED_ELT (treeop2, i));
+ tree op
+ = index >= ctor_nelts
+ ? CONSTRUCTOR_ELT (ctor1, index - ctor_nelts)->value
+ : CONSTRUCTOR_ELT (ctor0, index)->value;
+ builder.quick_push (expand_normal (op));
+ }
+ builder.finalize ();
+ if (targetm.vectorize.vla_constructor (target, builder))
+ return target;
+ }
+ }
expand_operands (treeop0, treeop1, target, &op0, &op1, EXPAND_NORMAL);
vec_perm_builder sel;
if (TREE_CODE (treeop2) == VECTOR_CST
@@ -1902,6 +1902,13 @@ implementation approaches itself.",
const vec_perm_indices &sel),
NULL)
+DEFHOOK
+(vla_constructor,
+ "This hook is used to expand a vla constructor into @var{target}\n\
+using the rtx_vector_builder @var{builder}.",
+ bool, (rtx target, rtx_vector_builder &builder),
+ NULL)
+
/* Return true if the target supports misaligned store/load of a
specific factor denoted in the third parameter. The last parameter
is true if the access is defined in a packed struct. */
@@ -262,6 +262,8 @@ enum poly_value_estimate_kind
extern bool verify_type_context (location_t, type_context_kind, const_tree,
bool = false);
+class rtx_vector_builder;
+
/* The target structure. This holds all the backend hooks. */
#define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME;
#define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS;
new file mode 100644
@@ -0,0 +1,134 @@
+/* { dg-options { "-O2" } } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_sve.h>
+
+/*
+** test0:
+** ins v0.s\[1\], v1.s\[0\]
+** mov z0.d, d0
+** ret
+*/
+svfloat32_t test0(float x, float y) {
+ return svdupq_n_f32(x, y, x, y);
+}
+/*
+** test1:
+** mov z0.s, s0
+** ret
+*/
+
+svfloat32_t test1(float x) {
+ return svdupq_n_f32(x, x, x, x);
+}
+
+/*
+** test2:
+** mov z0.s, w0
+** ret
+*/
+
+svint32_t test2(int x) {
+ return svdupq_n_s32(x, x, x, x);
+}
+
+/*
+** test3:
+** sxth w0, w0
+** fmov d0, x0
+** ins v0.h\[1\], w1
+** ins v0.h\[2\], w2
+** ins v0.h\[3\], w3
+** mov z0.d, d0
+** ret
+*/
+
+svint16_t test3(short a, short b, short c, short d)
+{
+ return svdupq_n_s16(a, b, c, d, a, b, c, d);
+}
+
+/*
+** test4:
+** dup v0.4h, w0
+** ins v0.h\[1\], w1
+** ins v0.h\[3\], w1
+** mov z0.d, d0
+** ret
+*/
+
+svint16_t test4(short a, short b)
+{
+ return svdupq_n_s16(a, b, a, b, a, b, a, b);
+}
+
+/*
+** test5:
+** mov z0.h, w0
+** ret
+*/
+
+svint16_t test5(short a)
+{
+ return svdupq_n_s16(a, a, a, a, a, a, a, a);
+}
+/*
+** test6:
+** sxtb w0, w0
+** fmov d0, x0
+** ins v0.b\[1\], w1
+** ins v0.b\[2\], w2
+** ins v0.b\[3\], w3
+** ins v0.b\[4\], w4
+** ins v0.b\[5\], w5
+** ins v0.b\[6\], w6
+** ins v0.b\[7\], w7
+** mov z0.d, d0
+** ret
+*/
+
+svint8_t test6(char a, char b, char c, char d, char e, char f, char g, char h)
+{
+ return svdupq_n_s8(a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h);
+}
+
+/*
+** test7:
+** dup v0.8b, w0
+** ins v0.b\[1\], w1
+** ins v0.b\[2\], w2
+** ins v0.b\[3\], w3
+** mov z0.s, s0
+** ret
+*/
+
+svint8_t test7(char a, char b, char c, char d)
+{
+ return svdupq_n_s8(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
+}
+
+
+/* We can do better than this.  */
+/*
+** test8:
+** sxtb w0, w0
+** fmov d0, x0
+** ins v0.d\[1\], x1
+** ins v0.b\[1\], w1
+** mov z0.h, h0
+** ret
+*/
+
+svint8_t test8(char a, char b)
+{
+ return svdupq_n_s8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
+}
+
+/*
+** test9:
+** mov z0.b, w0
+** ret
+*/
+
+svint8_t test9(char a)
+{
+ return svdupq_n_s8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a);
+}
@@ -1513,6 +1513,11 @@ lower_vec_perm (gimple_stmt_iterator *gsi)
if (!TYPE_VECTOR_SUBPARTS (vect_type).is_constant (&elements))
return;
+ /* A VEC_PERM_EXPR can combine a VLA mask with VLS operands (such as a
+ VLS CONSTRUCTOR); the result is then a VLA vector, so we cannot lower
+ it here.  */
+ if (!TYPE_VECTOR_SUBPARTS (mask_type).is_constant ())
+ return;
+
if (TREE_CODE (mask) == SSA_NAME)
{
gimple *def_stmt = SSA_NAME_DEF_STMT (mask);