@@ -194,7 +194,7 @@ ADV_SIMD_Q_REG_STRUCT_MODES (4, V4x16, V4x8, V4x4, V4x2)
stored in each 128-bit unit. The actual size of the mode depends
on command-line flags.
- VNx1TI isn't really a native SVE mode, but it can be useful in some
+ VNx1* aren't really native SVE modes, but they can be useful in some
limited situations. */
VECTOR_MODE_WITH_PREFIX (VNx, INT, TI, 1, 1);
SVE_MODES (1, VNx16, VNx8, VNx4, VNx2, VNx1)
@@ -204,9 +204,10 @@ SVE_MODES (4, VNx64, VNx32, VNx16, VNx8, VNx4)
/* Partial SVE vectors:
- VNx2QI VNx4QI VNx8QI
- VNx2HI VNx4HI
- VNx2SI
+ VNx2QI VNx4QI VNx8QI
+ VNx2HI VNx4HI
+ VNx1SI VNx2SI
+ VNx1DI
In memory they occupy contiguous locations, in the same way as fixed-length
vectors. E.g. VNx8QImode is half the size of VNx16QImode.
@@ -214,12 +215,17 @@ SVE_MODES (4, VNx64, VNx32, VNx16, VNx8, VNx4)
Passing 2 as the final argument ensures that the modes come after all
other single-vector modes in the GET_MODE_WIDER chain, so that we never
pick them in preference to a full vector mode. */
+VECTOR_MODE_WITH_PREFIX (VNx, INT, SI, 1, 2);
+VECTOR_MODE_WITH_PREFIX (VNx, INT, DI, 1, 2);
VECTOR_MODES_WITH_PREFIX (VNx, INT, 2, 2);
VECTOR_MODES_WITH_PREFIX (VNx, INT, 4, 2);
VECTOR_MODES_WITH_PREFIX (VNx, INT, 8, 2);
VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 4, 2);
VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 8, 2);
+ADJUST_NUNITS (VNx1SI, exact_div (aarch64_sve_vg, 2));
+ADJUST_NUNITS (VNx1DI, exact_div (aarch64_sve_vg, 2));
+
ADJUST_NUNITS (VNx2QI, aarch64_sve_vg);
ADJUST_NUNITS (VNx2HI, aarch64_sve_vg);
ADJUST_NUNITS (VNx2SI, aarch64_sve_vg);
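For reference, a standalone sketch (not part of the patch) of what the new ADJUST_NUNITS settings mean in practice: aarch64_sve_vg is VL/64, so exact_div (aarch64_sve_vg, 2) gives one element per 128-bit block, which matches the memory footprint that the LD1W/LD1D (to .Q) forms elsewhere in the patch operate on.

#include <cstdio>

// Illustrative only: sizes of the new VNx1SI/VNx1DI memory-only modes
// as a function of the SVE vector length in bits.
int main ()
{
  for (unsigned vl_bits = 128; vl_bits <= 2048; vl_bits *= 2)
    {
      unsigned vg = vl_bits / 64;    // as in aarch64_sve_vg
      unsigned nunits = vg / 2;      // the ADJUST_NUNITS value above
      printf ("VL=%4u: VNx1SI = %u x 4 bytes, VNx1DI = %u x 8 bytes\n",
              vl_bits, nunits, nunits);
    }
  return 0;
}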
@@ -245,9 +251,12 @@ ADJUST_ALIGNMENT (VNx2BF, 2);
ADJUST_ALIGNMENT (VNx4HF, 2);
ADJUST_ALIGNMENT (VNx4BF, 2);
+ADJUST_ALIGNMENT (VNx1SI, 4);
ADJUST_ALIGNMENT (VNx2SI, 4);
ADJUST_ALIGNMENT (VNx2SF, 4);
+ADJUST_ALIGNMENT (VNx1DI, 8);
+
/* Quad float: 128-bit floating mode for long doubles. */
FLOAT_MODE (TF, 16, ieee_quad_format);
@@ -956,7 +956,8 @@ public:
return e.use_exact_insn (code_for_aarch64_sve_dup_lane (mode));
/* Treat svdup_lane as if it were svtbl_n. */
- return e.use_exact_insn (code_for_aarch64_sve_tbl (e.vector_mode (0)));
+ return e.use_exact_insn (code_for_aarch64_sve (UNSPEC_TBL,
+ e.vector_mode (0)));
}
};
@@ -2897,16 +2898,6 @@ public:
}
};
-class svtbl_impl : public permute
-{
-public:
- rtx
- expand (function_expander &e) const override
- {
- return e.use_exact_insn (code_for_aarch64_sve_tbl (e.vector_mode (0)));
- }
-};
-
/* Implements svtrn1 and svtrn2. */
class svtrn_impl : public binary_permute
{
@@ -3432,7 +3423,8 @@ FUNCTION (svsub, svsub_impl,)
FUNCTION (svsubr, rtx_code_function_rotated, (MINUS, MINUS, UNSPEC_COND_FSUB))
FUNCTION (svsudot, svusdot_impl, (true))
FUNCTION (svsudot_lane, svdotprod_lane_impl, (UNSPEC_SUDOT, -1, -1))
-FUNCTION (svtbl, svtbl_impl,)
+FUNCTION (svtbl, quiet<unspec_based_uncond_function>, (UNSPEC_TBL, UNSPEC_TBL,
+ UNSPEC_TBL))
FUNCTION (svtmad, CODE_FOR_MODE0 (aarch64_sve_tmad),)
FUNCTION (svtrn1, svtrn_impl, (0))
FUNCTION (svtrn1q, unspec_based_function, (UNSPEC_TRN1Q, UNSPEC_TRN1Q,
@@ -600,7 +600,7 @@ public:
tree perm_type = build_vector_type (ssizetype, nelts);
return gimple_build_assign (f.lhs, VEC_PERM_EXPR,
gimple_call_arg (f.call, 0),
- gimple_call_arg (f.call, nargs - 1),
+ gimple_call_arg (f.call, nargs == 1 ? 0 : 1),
vec_perm_indices_to_tree (perm_type, indices));
}
};
@@ -735,7 +735,7 @@ struct binary_za_slice_opt_single_base : public overloaded_base<1>
}
};
-/* Base class for ext. */
+/* Base class for ext and extq. */
struct ext_base : public overloaded_base<0>
{
void
@@ -850,6 +850,22 @@ struct load_gather_sv_base : public overloaded_base<0>
}
};
+/* Base class for load_gather64_sv_index and load_gather64_sv_offset. */
+struct load_gather64_sv_base : public load_gather_sv_base
+{
+ type_suffix_index
+ vector_base_type (type_suffix_index) const override
+ {
+ return TYPE_SUFFIX_u64;
+ }
+
+ function_resolver::target_type_restrictions
+ get_target_type_restrictions (const function_instance &) const override
+ {
+ return function_resolver::TARGET_ANY;
+ }
+};
+
/* Base class for load_ext_gather_index and load_ext_gather_offset,
which differ only in the units of the displacement. */
struct load_ext_gather_base : public overloaded_base<1>
@@ -1033,6 +1049,22 @@ struct store_scatter_base : public overloaded_base<0>
}
};
+/* Base class for store_scatter64_index and store_scatter64_offset. */
+struct store_scatter64_base : public store_scatter_base
+{
+ type_suffix_index
+ vector_base_type (type_suffix_index) const override
+ {
+ return TYPE_SUFFIX_u64;
+ }
+
+ type_suffix_index
+ infer_vector_type (function_resolver &r, unsigned int argno) const override
+ {
+ return r.infer_vector_type (argno);
+ }
+};
+
/* Base class for ternary operations in which the final argument is an
immediate shift amount. The derived class should check the range. */
struct ternary_shift_imm_base : public overloaded_base<0>
@@ -2441,6 +2473,21 @@ struct ext_def : public ext_base
};
SHAPE (ext)
+/* sv<t0>_t svfoo[_t0](sv<t0>_t, sv<t0>_t, uint64_t)
+
+ where the final argument is an integer constant expression that when
+ multiplied by the number of bytes in t0 is in the range [0, 15]. */
+struct extq_def : public ext_base
+{
+ bool
+ check (function_checker &c) const override
+ {
+ unsigned int bytes = c.type_suffix (0).element_bytes;
+ return c.require_immediate_range (2, 0, 16 / bytes - 1);
+ }
+};
+SHAPE (extq)
+
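As a quick illustration of the check above (not part of the patch): the immediate must keep the selected element within a single 16-byte block, so the upper bound shrinks as the element size grows.

#include <cstdio>

// Illustrative only: the [0, 16 / sizeof (<t0>_t) - 1] range that
// extq_def::check enforces, per element size.
int main ()
{
  const unsigned element_bytes[] = { 1, 2, 4, 8 };
  for (unsigned bytes : element_bytes)
    printf ("%u-byte elements: immediate range [0, %u]\n",
            bytes, 16 / bytes - 1);
  return 0;
}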
/* svboolx<g>_t svfoo_t0_g(sv<t0>_t, sv<t0>_t, uint32_t). */
struct extract_pred_def : public nonoverloaded_base
{
@@ -2992,6 +3039,75 @@ struct load_gather_vs_def : public overloaded_base<1>
};
SHAPE (load_gather_vs)
+/* sv<t0>_t svfoo_[s64]index[_t0](const <t0>_t *, svint64_t)
+ sv<t0>_t svfoo_[u64]index[_t0](const <t0>_t *, svuint64_t). */
+struct load_gather64_sv_index_def : public load_gather64_sv_base
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ b.add_overloaded_functions (group, MODE_index);
+ build_all (b, "t0,al,d", group, MODE_s64index);
+ build_all (b, "t0,al,d", group, MODE_u64index);
+ }
+};
+SHAPE (load_gather64_sv_index)
+
+/* sv<t0>_t svfoo_[s64]offset[_t0](const <t0>_t *, svint64_t)
+ sv<t0>_t svfoo_[u64]offset[_t0](const <t0>_t *, svuint64_t). */
+struct load_gather64_sv_offset_def : public load_gather64_sv_base
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ b.add_overloaded_functions (group, MODE_offset);
+ build_all (b, "t0,al,d", group, MODE_s64offset);
+ build_all (b, "t0,al,d", group, MODE_u64offset);
+ }
+};
+SHAPE (load_gather64_sv_offset)
+
+/* sv<t0>_t svfoo[_u64base]_index_t0(svuint64_t, int64_t). */
+struct load_gather64_vs_index_def : public nonoverloaded_base
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ build_all (b, "t0,b,ss64", group, MODE_u64base_index, true);
+ }
+
+ tree
+ resolve (function_resolver &) const override
+ {
+ /* The short name just makes the base vector mode implicit;
+ no resolution is needed. */
+ gcc_unreachable ();
+ }
+};
+SHAPE (load_gather64_vs_index)
+
+/* sv<t0>_t svfoo[_u64base]_t0(svuint64_t)
+
+ sv<t0>_t svfoo[_u64base]_offset_t0(svuint64_t, int64_t). */
+struct load_gather64_vs_offset_def : public nonoverloaded_base
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ build_all (b, "t0,b", group, MODE_u64base, true);
+ build_all (b, "t0,b,ss64", group, MODE_u64base_offset, true);
+ }
+
+ tree
+ resolve (function_resolver &) const override
+ {
+ /* The short name just makes the base vector mode implicit;
+ no resolution is needed. */
+ gcc_unreachable ();
+ }
+};
+SHAPE (load_gather64_vs_offset)
+
/* sv<t0>_t svfoo[_t0](const <t0>_t *)
The only difference from "load" is that this shape has no vnum form. */
@@ -3044,6 +3160,92 @@ struct pattern_pred_def : public nonoverloaded_base
};
SHAPE (pattern_pred)
+/* svbool_t svfoo[_t0](sv<t0>_t). */
+struct pmov_from_vector_def : public overloaded_base<0>
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ b.add_overloaded_functions (group, MODE_none);
+ build_all (b, "vp,v0", group, MODE_none);
+ }
+
+ tree
+ resolve (function_resolver &r) const override
+ {
+ return r.resolve_uniform (1);
+ }
+};
+SHAPE (pmov_from_vector)
+
+/* svbool_t svfoo[_t0](sv<t0>_t, uint64_t)
+
+ where the final argument is an integer constant expression in the
+ range [0, sizeof (<t0>_t) - 1]. */
+struct pmov_from_vector_lane_def : public overloaded_base<0>
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ b.add_overloaded_functions (group, MODE_none);
+ build_all (b, "vp,v0,su64", group, MODE_none);
+ }
+
+ tree
+ resolve (function_resolver &r) const override
+ {
+ return r.resolve_uniform (1, 1);
+ }
+
+ bool
+ check (function_checker &c) const override
+ {
+ unsigned int bytes = c.type_suffix (0).element_bytes;
+ return c.require_immediate_range (1, 0, bytes - 1);
+ }
+};
+SHAPE (pmov_from_vector_lane)
+
+/* sv<t0>_t svfoo[_t0]_m(sv<t0>_t, svbool_t, uint64_t)
+
+ where the final argument is an integer constant expression in the
+ range [1, sizeof (<t0>_t) - 1]. */
+struct pmov_to_vector_lane_def : public overloaded_base<0>
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ b.add_overloaded_functions (group, MODE_none);
+ build_all (b, "v0,su64", group, MODE_none);
+ }
+
+ tree
+ resolve (function_resolver &r) const override
+ {
+ type_suffix_index type;
+ gcc_assert (r.pred == PRED_m);
+ if (!r.check_num_arguments (3)
+ || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES
+ || !r.require_vector_type (1, VECTOR_TYPE_svbool_t)
+ || !r.require_integer_immediate (2))
+ return error_mark_node;
+
+ return r.resolve_to (r.mode_suffix_id, type);
+ }
+
+ bool
+ check (function_checker &c) const override
+ {
+ unsigned int bytes = c.type_suffix (0).element_bytes;
+ /* 1 to account for the vector argument.
+
+ ??? This should probably be folded into function_checker::m_base_arg,
+ but it doesn't currently have the necessary information. */
+ return c.require_immediate_range (1, 1, bytes - 1);
+ }
+};
+SHAPE (pmov_to_vector_lane)
+
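A standalone sketch (not part of the patch) of the immediate ranges the two svpmov_lane shapes accept: the from-vector form takes [0, sizeof (element) - 1], while the merging to-vector form, which only exists for 2-, 4- and 8-byte elements, starts at 1, presumably because part 0 is already covered by the zeroing svpmov overload.

#include <cstdio>

// Illustrative only: lane ranges per element size.
int main ()
{
  const unsigned element_bytes[] = { 1, 2, 4, 8 };
  for (unsigned bytes : element_bytes)
    {
      printf ("%u-byte elements: from-vector lane range [0, %u]",
              bytes, bytes - 1);
      if (bytes > 1)
        printf (", to-vector lane range [1, %u]", bytes - 1);
      printf ("\n");
    }
  return 0;
}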
/* void svfoo(const void *, svprfop)
void svfoo_vnum(const void *, int64_t, svprfop). */
struct prefetch_def : public nonoverloaded_base
@@ -3215,6 +3417,24 @@ struct reduction_def : public overloaded_base<0>
};
SHAPE (reduction)
+/* <t0>xN_t svfoo[_t0](sv<t0>_t). */
+struct reduction_neonq_def : public overloaded_base<0>
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ b.add_overloaded_functions (group, MODE_none);
+ build_all (b, "Q0,v0", group, MODE_none);
+ }
+
+ tree
+ resolve (function_resolver &r) const override
+ {
+ return r.resolve_uniform (1);
+ }
+};
+SHAPE (reduction_neonq)
+
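For context, a standalone sketch (an assumption about the architectural behaviour, not taken from this patch) of what the *QV reductions compute: each lane of the 128-bit result combines the corresponding lane of every 128-bit segment of the SVE input, so the result is always a fixed-width Advanced SIMD vector. ADDQV is shown with all elements assumed active; the other reductions differ only in the combining operation.

#include <cstdio>

int main ()
{
  const unsigned vl_bits = 512, esize = 32;
  const unsigned subelts = 128 / esize;   // lanes per 128-bit segment
  const unsigned nelts = vl_bits / esize;
  unsigned in[nelts], out[subelts] = { 0 };

  for (unsigned i = 0; i < nelts; ++i)
    in[i] = i;
  for (unsigned i = 0; i < nelts; ++i)
    out[i % subelts] += in[i];            // segment-wise lane sum

  for (unsigned j = 0; j < subelts; ++j)
    printf ("out[%u] = %u\n", j, out[j]);
  return 0;
}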
/* int64_t svfoo[_t0](sv<t0>_t) (for signed t0)
uint64_t svfoo[_t0](sv<t0>_t) (for unsigned t0)
<t0>_t svfoo[_t0](sv<t0>_t) (for floating-point t0)
@@ -3612,6 +3832,44 @@ struct store_scatter_offset_restricted_def : public store_scatter_base
};
SHAPE (store_scatter_offset_restricted)
+/* void svfoo_[s64]index[_t0](<t0>_t *, svint64_t, sv<t0>_t)
+ void svfoo_[u64]index[_t0](<t0>_t *, svuint64_t, sv<t0>_t)
+
+ void svfoo[_u64base]_index[_t0](svuint64_t, int64_t, sv<t0>_t). */
+struct store_scatter64_index_def : public store_scatter64_base
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ b.add_overloaded_functions (group, MODE_index);
+ build_all (b, "_,as,d,t0", group, MODE_s64index);
+ build_all (b, "_,as,d,t0", group, MODE_u64index);
+ build_all (b, "_,b,ss64,t0", group, MODE_u64base_index);
+ }
+};
+SHAPE (store_scatter64_index)
+
+/* void svfoo_[s64]offset[_t0](<t0>_t *, svint64_t, sv<t0>_t)
+ void svfoo_[u64]offset[_t0](<t0>_t *, svuint64_t, sv<t0>_t)
+
+ void svfoo[_u64base_t0](svuint64_t, sv<t0>_t)
+
+ void svfoo[_u64base]_offset[_t0](svuint64_t, int64_t, sv<t0>_t). */
+struct store_scatter64_offset_def : public store_scatter64_base
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ b.add_overloaded_functions (group, MODE_none);
+ b.add_overloaded_functions (group, MODE_offset);
+ build_all (b, "_,as,d,t0", group, MODE_s64offset);
+ build_all (b, "_,as,d,t0", group, MODE_u64offset);
+ build_all (b, "_,b,t0", group, MODE_u64base);
+ build_all (b, "_,b,ss64,t0", group, MODE_u64base_offset);
+ }
+};
+SHAPE (store_scatter64_offset)
+
/* void svfoo_t0(uint64_t, uint32_t, svbool_t, void *)
void svfoo_vnum_t0(uint64_t, uint32_t, svbool_t, void *, int64_t)
@@ -4365,6 +4623,33 @@ struct unary_convertxn_def : public unary_convert_def
};
SHAPE (unary_convertxn)
+/* sv<t0>_t svfoo[_t0](sv<t0>_t, uint64_t)
+
+ where the final argument is an integer constant expression in the
+ range [0, 16 / sizeof (<t0>_t) - 1]. */
+struct unary_lane_def : public overloaded_base<0>
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ b.add_overloaded_functions (group, MODE_none);
+ build_all (b, "v0,v0,su64", group, MODE_none);
+ }
+
+ tree
+ resolve (function_resolver &r) const override
+ {
+ return r.resolve_uniform (1, 1);
+ }
+
+ bool
+ check (function_checker &c) const override
+ {
+ return c.require_immediate_lane_index (1, 0);
+ }
+};
+SHAPE (unary_lane)
+
/* sv<t0>_t svfoo[_t0](sv<t0:half>_t). */
struct unary_long_def : public overloaded_base<0>
{
@@ -128,6 +128,7 @@ namespace aarch64_sve
extern const function_shape *const dupq;
extern const function_shape *const dup_neonq;
extern const function_shape *const ext;
+ extern const function_shape *const extq;
extern const function_shape *const extract_pred;
extern const function_shape *const fold_left;
extern const function_shape *const get;
@@ -152,12 +153,19 @@ namespace aarch64_sve
extern const function_shape *const load_gather_sv;
extern const function_shape *const load_gather_sv_restricted;
extern const function_shape *const load_gather_vs;
+ extern const function_shape *const load_gather64_sv_index;
+ extern const function_shape *const load_gather64_sv_offset;
+ extern const function_shape *const load_gather64_vs_index;
+ extern const function_shape *const load_gather64_vs_offset;
extern const function_shape *const load_replicate;
extern const function_shape *const load_za;
extern const function_shape *const luti2_lane_zt;
extern const function_shape *const luti4_lane_zt;
extern const function_shape *const mmla;
extern const function_shape *const pattern_pred;
+ extern const function_shape *const pmov_from_vector;
+ extern const function_shape *const pmov_from_vector_lane;
+ extern const function_shape *const pmov_to_vector_lane;
extern const function_shape *const prefetch;
extern const function_shape *const prefetch_gather_index;
extern const function_shape *const prefetch_gather_offset;
@@ -167,6 +175,7 @@ namespace aarch64_sve
extern const function_shape *const read_za_m;
extern const function_shape *const read_za_slice;
extern const function_shape *const reduction;
+ extern const function_shape *const reduction_neonq;
extern const function_shape *const reduction_wide;
extern const function_shape *const reinterpret;
extern const function_shape *const select_pred;
@@ -186,6 +195,8 @@ namespace aarch64_sve
extern const function_shape *const store_scatter_index_restricted;
extern const function_shape *const store_scatter_offset;
extern const function_shape *const store_scatter_offset_restricted;
+ extern const function_shape *const store_scatter64_index;
+ extern const function_shape *const store_scatter64_offset;
extern const function_shape *const store_za;
extern const function_shape *const storexn;
extern const function_shape *const str_za;
@@ -218,6 +229,7 @@ namespace aarch64_sve
extern const function_shape *const unary_convert;
extern const function_shape *const unary_convert_narrowt;
extern const function_shape *const unary_convertxn;
+ extern const function_shape *const unary_lane;
extern const function_shape *const unary_long;
extern const function_shape *const unary_n;
extern const function_shape *const unary_narrowb;
@@ -78,6 +78,44 @@ unspec_sqrdcmlah (int rot)
}
}
+class ld1uxq_st1xq_base : public function_base
+{
+public:
+ CONSTEXPR ld1uxq_st1xq_base (machine_mode memory_mode)
+ : m_memory_mode (memory_mode) {}
+
+ tree
+ memory_scalar_type (const function_instance &fi) const override
+ {
+ return fi.scalar_type (0);
+ }
+
+ machine_mode
+ memory_vector_mode (const function_instance &) const override
+ {
+ return m_memory_mode;
+ }
+
+protected:
+ machine_mode m_memory_mode;
+};
+
+class ld234q_st234q_base : public full_width_access
+{
+public:
+ CONSTEXPR ld234q_st234q_base (unsigned int vector_count, machine_mode mode)
+ : full_width_access (vector_count), m_mode (mode)
+ {}
+
+ machine_mode
+ memory_vector_mode (const function_instance &) const override
+ {
+ return m_mode;
+ }
+
+ machine_mode m_mode;
+};
+
class svaba_impl : public function_base
{
public:
@@ -183,6 +221,100 @@ public:
}
};
+class svdup_laneq_impl : public function_base
+{
+public:
+ rtx
+ expand (function_expander &e) const override
+ {
+ return e.use_exact_insn (code_for_aarch64_sve_dupq (e.result_mode ()));
+ }
+};
+
+class svextq_impl : public permute
+{
+public:
+ gimple *
+ fold (gimple_folder &f) const override
+ {
+ unsigned int index = tree_to_uhwi (gimple_call_arg (f.call, 2));
+ machine_mode mode = f.vector_mode (0);
+ unsigned int subelts = 128U / GET_MODE_UNIT_BITSIZE (mode);
+ poly_uint64 nelts = GET_MODE_NUNITS (mode);
+ vec_perm_builder builder (nelts, subelts, 3);
+ for (unsigned int i = 0; i < 3; ++i)
+ for (unsigned int j = 0; j < subelts; ++j)
+ {
+ if (index + j < subelts)
+ builder.quick_push (i * subelts + index + j);
+ else
+ builder.quick_push (i * subelts + index + j - subelts + nelts);
+ }
+ return fold_permute (f, builder);
+ }
+
+ rtx
+ expand (function_expander &e) const override
+ {
+ return e.use_exact_insn (code_for_aarch64_sve_extq (e.vector_mode (0)));
+ }
+};
+
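To make the index pattern above easier to check, here is a standalone sketch (not part of the patch) that expands the selector svextq_impl::fold encodes, for the assumed case of 32-bit elements, a 256-bit vector length and immediate 1. Indices below nelts select from the first operand, the rest from the second, as in a VEC_PERM_EXPR selector.

#include <cstdio>
#include <vector>

int main ()
{
  const unsigned vl_bits = 256;            // assumed vector length
  const unsigned esize = 32;               // .s elements
  const unsigned index = 1;                // svextq immediate
  const unsigned subelts = 128 / esize;    // elements per 128-bit block
  const unsigned nelts = vl_bits / esize;  // elements per vector

  std::vector<unsigned> sel;
  for (unsigned blk = 0; blk < nelts / subelts; ++blk)
    for (unsigned j = 0; j < subelts; ++j)
      sel.push_back (index + j < subelts
                     ? blk * subelts + index + j
                     : blk * subelts + index + j - subelts + nelts);

  for (unsigned s : sel)
    printf ("%u ", s);   // 1 2 3 8 5 6 7 12 for the values above
  printf ("\n");
  return 0;
}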
+class svld1q_gather_impl : public full_width_access
+{
+public:
+ unsigned int
+ call_properties (const function_instance &) const override
+ {
+ return CP_READ_MEMORY;
+ }
+
+ rtx
+ expand (function_expander &e) const override
+ {
+ e.prepare_gather_address_operands (1, false);
+ return e.use_exact_insn (CODE_FOR_aarch64_gather_ld1q);
+ }
+};
+
+class svld1uxq_impl : public ld1uxq_st1xq_base
+{
+public:
+ using ld1uxq_st1xq_base::ld1uxq_st1xq_base;
+
+ unsigned int
+ call_properties (const function_instance &) const override
+ {
+ return CP_READ_MEMORY;
+ }
+
+ rtx
+ expand (function_expander &e) const override
+ {
+ insn_code icode = code_for_aarch64_sve_ld1_extendq (e.vector_mode (0));
+ return e.use_contiguous_load_insn (icode);
+ }
+};
+
+class svld234q_impl : public ld234q_st234q_base
+{
+public:
+ using ld234q_st234q_base::ld234q_st234q_base;
+
+ unsigned int
+ call_properties (const function_instance &) const override
+ {
+ return CP_READ_MEMORY;
+ }
+
+ rtx
+ expand (function_expander &e) const override
+ {
+ insn_code icode = code_for_aarch64_sve_ldnq (e.result_mode ());
+ return e.use_contiguous_load_insn (icode);
+ }
+};
+
class svldnt1_gather_impl : public full_width_access
{
public:
@@ -268,6 +400,38 @@ public:
}
};
+class svpmov_impl : public function_base
+{
+public:
+ rtx
+ expand (function_expander &e) const override
+ {
+ insn_code icode;
+ if (e.pred == PRED_z)
+ icode = code_for_aarch64_pmov_to (e.vector_mode (0));
+ else
+ icode = code_for_aarch64_pmov_from (e.vector_mode (0));
+ return e.use_exact_insn (icode);
+ }
+};
+
+class svpmov_lane_impl : public function_base
+{
+public:
+ rtx
+ expand (function_expander &e) const override
+ {
+ insn_code icode;
+ if (e.pred == PRED_m)
+ icode = code_for_aarch64_pmov_lane_to (e.vector_mode (0));
+ else if (e.args[1] == const0_rtx)
+ icode = code_for_aarch64_pmov_from (e.vector_mode (0));
+ else
+ icode = code_for_aarch64_pmov_lane_from (e.vector_mode (0));
+ return e.use_exact_insn (icode);
+ }
+};
+
class svpsel_lane_impl : public function_base
{
public:
@@ -479,7 +643,7 @@ public:
gimple_call_set_arg (call, 2, imm3_prec);
return call;
}
-public:
+
rtx
expand (function_expander &e) const override
{
@@ -489,6 +653,64 @@ public:
}
};
+class svst1q_scatter_impl : public full_width_access
+{
+public:
+ unsigned int
+ call_properties (const function_instance &) const override
+ {
+ return CP_WRITE_MEMORY;
+ }
+
+ rtx
+ expand (function_expander &e) const override
+ {
+ rtx data = e.args.last ();
+ e.args.last () = force_lowpart_subreg (VNx2DImode, data, GET_MODE (data));
+ e.prepare_gather_address_operands (1, false);
+ return e.use_exact_insn (CODE_FOR_aarch64_scatter_st1q);
+ }
+};
+
+class svst1xq_impl : public ld1uxq_st1xq_base
+{
+public:
+ using ld1uxq_st1xq_base::ld1uxq_st1xq_base;
+
+ unsigned int
+ call_properties (const function_instance &) const override
+ {
+ return CP_WRITE_MEMORY;
+ }
+
+ rtx
+ expand (function_expander &e) const override
+ {
+ insn_code icode = code_for_aarch64_sve_st1_truncq (e.vector_mode (0));
+ return e.use_contiguous_store_insn (icode);
+ }
+};
+
+class svst234q_impl : public ld234q_st234q_base
+{
+public:
+ using ld234q_st234q_base::ld234q_st234q_base;
+
+ unsigned int
+ call_properties (const function_instance &) const override
+ {
+ return CP_WRITE_MEMORY;
+ }
+
+ rtx
+ expand (function_expander &e) const override
+ {
+ machine_mode tuple_mode = GET_MODE (e.args.last ());
+ insn_code icode = code_for_aarch64_sve_stnq (tuple_mode);
+ return e.use_contiguous_store_insn (icode);
+ }
+};
+
class svstnt1_scatter_impl : public full_width_access
{
public:
@@ -562,6 +784,34 @@ public:
}
};
+/* Implements svuzpq1 and svuzpq2. */
+class svuzpq_impl : public binary_permute
+{
+public:
+ CONSTEXPR svuzpq_impl (unsigned int base)
+ : binary_permute (base ? UNSPEC_UZPQ2 : UNSPEC_UZPQ1), m_base (base) {}
+
+ gimple *
+ fold (gimple_folder &f) const override
+ {
+ machine_mode mode = f.vector_mode (0);
+ unsigned int subelts = 128U / GET_MODE_UNIT_BITSIZE (mode);
+ poly_uint64 nelts = GET_MODE_NUNITS (mode);
+ vec_perm_builder builder (nelts, subelts, 3);
+ for (unsigned int i = 0; i < 3; ++i)
+ {
+ for (unsigned int j = 0; j < subelts / 2; ++j)
+ builder.quick_push (m_base + j * 2 + i * subelts);
+ for (unsigned int j = 0; j < subelts / 2; ++j)
+ builder.quick_push (m_base + j * 2 + i * subelts + nelts);
+ }
+ return fold_permute (f, builder);
+ }
+
+ /* 0 for svuzpq1, 1 for svuzpq2. */
+ unsigned int m_base;
+};
+
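A standalone sketch (not part of the patch) of the fully expanded selector this fold encodes, assuming 32-bit elements and a 256-bit vector length: each 128-bit block of the result takes the even-numbered lanes of the corresponding blocks of the two operands for UZPQ1, or the odd-numbered lanes for UZPQ2; indices of nelts or more refer to the second operand.

#include <cstdio>

int main ()
{
  const unsigned esize = 32, vl_bits = 256;
  const unsigned subelts = 128 / esize;     // 4
  const unsigned nelts = vl_bits / esize;   // 8
  const unsigned m_base = 0;                // 0 for UZPQ1, 1 for UZPQ2

  for (unsigned blk = 0; blk < nelts / subelts; ++blk)
    {
      for (unsigned j = 0; j < subelts / 2; ++j)
        printf ("%u ", m_base + j * 2 + blk * subelts);
      for (unsigned j = 0; j < subelts / 2; ++j)
        printf ("%u ", m_base + j * 2 + blk * subelts + nelts);
    }
  printf ("\n");   // 0 2 8 10 4 6 12 14
  return 0;
}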
/* Implements both svwhilerw and svwhilewr; the unspec parameter decides
between them. */
class svwhilerw_svwhilewr_impl : public full_width_access
@@ -580,6 +830,34 @@ public:
int m_unspec;
};
+/* Implements svzipq1 and svzipq2. */
+class svzipq_impl : public binary_permute
+{
+public:
+ CONSTEXPR svzipq_impl (unsigned int base)
+ : binary_permute (base ? UNSPEC_ZIPQ2 : UNSPEC_ZIPQ1), m_base (base) {}
+
+ gimple *
+ fold (gimple_folder &f) const override
+ {
+ machine_mode mode = f.vector_mode (0);
+ unsigned int pairs = 64U / GET_MODE_UNIT_BITSIZE (mode);
+ poly_uint64 nelts = GET_MODE_NUNITS (mode);
+ auto base = m_base * pairs;
+ vec_perm_builder builder (nelts, pairs * 2, 3);
+ for (unsigned int i = 0; i < 3; ++i)
+ for (unsigned int j = 0; j < pairs; ++j)
+ {
+ builder.quick_push (base + j + i * pairs * 2);
+ builder.quick_push (base + j + i * pairs * 2 + nelts);
+ }
+ return fold_permute (f, builder);
+ }
+
+ /* 0 for svzipq1, 1 for svzipq2. */
+ unsigned int m_base;
+};
+
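Similarly, a standalone sketch (not part of the patch) of the expanded ZIPQ1 selector for 32-bit elements at an assumed 256-bit vector length: each 128-bit block interleaves the low halves of the corresponding blocks of the two operands, with indices of nelts or more referring to the second operand.

#include <cstdio>

int main ()
{
  const unsigned esize = 32, vl_bits = 256;
  const unsigned pairs = 64 / esize;          // 2
  const unsigned nelts = vl_bits / esize;     // 8
  const unsigned base = 0;                    // 0 for ZIPQ1, pairs for ZIPQ2

  for (unsigned blk = 0; blk < nelts / (pairs * 2); ++blk)
    for (unsigned j = 0; j < pairs; ++j)
      printf ("%u %u ", base + j + blk * pairs * 2,
              base + j + blk * pairs * 2 + nelts);
  printf ("\n");   // 0 8 1 9 4 12 5 13
  return 0;
}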
} /* end anonymous namespace */
namespace aarch64_sve {
@@ -601,6 +879,7 @@ FUNCTION (svaddlbt, unspec_based_function, (UNSPEC_SADDLBT, -1, -1))
FUNCTION (svaddlt, unspec_based_function, (UNSPEC_SADDLT, UNSPEC_UADDLT, -1))
FUNCTION (svaddp, unspec_based_pred_function, (UNSPEC_ADDP, UNSPEC_ADDP,
UNSPEC_FADDP))
+FUNCTION (svaddqv, reduction, (UNSPEC_ADDQV, UNSPEC_ADDQV, UNSPEC_FADDQV))
FUNCTION (svaddwb, unspec_based_function, (UNSPEC_SADDWB, UNSPEC_UADDWB, -1))
FUNCTION (svaddwt, unspec_based_function, (UNSPEC_SADDWT, UNSPEC_UADDWT, -1))
FUNCTION (svaesd, fixed_insn_function, (CODE_FOR_aarch64_sve2_aesd))
@@ -611,6 +890,7 @@ FUNCTION (svamax, cond_or_uncond_unspec_function,
(UNSPEC_COND_FAMAX, UNSPEC_FAMAX))
FUNCTION (svamin, cond_or_uncond_unspec_function,
(UNSPEC_COND_FAMIN, UNSPEC_FAMIN))
+FUNCTION (svandqv, reduction, (UNSPEC_ANDQV, UNSPEC_ANDQV, -1))
FUNCTION (svbcax, CODE_FOR_MODE0 (aarch64_sve2_bcax),)
FUNCTION (svbdep, unspec_based_function, (UNSPEC_BDEP, UNSPEC_BDEP, -1))
FUNCTION (svbext, unspec_based_function, (UNSPEC_BEXT, UNSPEC_BEXT, -1))
@@ -631,15 +911,24 @@ FUNCTION (svcvtlt, unspec_based_function, (-1, -1, UNSPEC_COND_FCVTLT))
FUNCTION (svcvtn, svcvtn_impl,)
FUNCTION (svcvtx, unspec_based_function, (-1, -1, UNSPEC_COND_FCVTX))
FUNCTION (svcvtxnt, CODE_FOR_MODE1 (aarch64_sve2_cvtxnt),)
+FUNCTION (svdup_laneq, svdup_laneq_impl,)
FUNCTION (sveor3, CODE_FOR_MODE0 (aarch64_sve2_eor3),)
FUNCTION (sveorbt, unspec_based_function, (UNSPEC_EORBT, UNSPEC_EORBT, -1))
+FUNCTION (sveorqv, reduction, (UNSPEC_EORQV, UNSPEC_EORQV, -1))
FUNCTION (sveortb, unspec_based_function, (UNSPEC_EORTB, UNSPEC_EORTB, -1))
+FUNCTION (svextq, svextq_impl,)
FUNCTION (svhadd, unspec_based_function, (UNSPEC_SHADD, UNSPEC_UHADD, -1))
FUNCTION (svhsub, unspec_based_function, (UNSPEC_SHSUB, UNSPEC_UHSUB, -1))
FUNCTION (svhistcnt, CODE_FOR_MODE0 (aarch64_sve2_histcnt),)
FUNCTION (svhistseg, CODE_FOR_MODE0 (aarch64_sve2_histseg),)
FUNCTION (svhsubr, unspec_based_function_rotated, (UNSPEC_SHSUB,
UNSPEC_UHSUB, -1))
+FUNCTION (svld1q_gather, svld1q_gather_impl,)
+FUNCTION (svld1udq, svld1uxq_impl, (VNx1DImode))
+FUNCTION (svld1uwq, svld1uxq_impl, (VNx1SImode))
+FUNCTION (svld2q, svld234q_impl, (2, VNx2TImode))
+FUNCTION (svld3q, svld234q_impl, (3, VNx3TImode))
+FUNCTION (svld4q, svld234q_impl, (4, VNx4TImode))
FUNCTION (svldnt1_gather, svldnt1_gather_impl,)
FUNCTION (svldnt1sb_gather, svldnt1_gather_extend_impl, (TYPE_SUFFIX_s8))
FUNCTION (svldnt1sh_gather, svldnt1_gather_extend_impl, (TYPE_SUFFIX_s16))
@@ -650,11 +939,15 @@ FUNCTION (svldnt1uw_gather, svldnt1_gather_extend_impl, (TYPE_SUFFIX_u32))
FUNCTION (svlogb, unspec_based_function, (-1, -1, UNSPEC_COND_FLOGB))
FUNCTION (svmatch, svmatch_svnmatch_impl, (UNSPEC_MATCH))
FUNCTION (svmaxnmp, unspec_based_pred_function, (-1, -1, UNSPEC_FMAXNMP))
+FUNCTION (svmaxnmqv, reduction, (-1, -1, UNSPEC_FMAXNMQV))
FUNCTION (svmaxp, unspec_based_pred_function, (UNSPEC_SMAXP, UNSPEC_UMAXP,
UNSPEC_FMAXP))
+FUNCTION (svmaxqv, reduction, (UNSPEC_SMAXQV, UNSPEC_UMAXQV, UNSPEC_FMAXQV))
FUNCTION (svminnmp, unspec_based_pred_function, (-1, -1, UNSPEC_FMINNMP))
+FUNCTION (svminnmqv, reduction, (-1, -1, UNSPEC_FMINNMQV))
FUNCTION (svminp, unspec_based_pred_function, (UNSPEC_SMINP, UNSPEC_UMINP,
UNSPEC_FMINP))
+FUNCTION (svminqv, reduction, (UNSPEC_SMINQV, UNSPEC_UMINQV, UNSPEC_FMINQV))
FUNCTION (svmlalb, unspec_based_mla_function, (UNSPEC_SMULLB,
UNSPEC_UMULLB, UNSPEC_FMLALB))
FUNCTION (svmlalb_lane, unspec_based_mla_lane_function, (UNSPEC_SMULLB,
@@ -685,7 +978,10 @@ FUNCTION (svmullt_lane, unspec_based_lane_function, (UNSPEC_SMULLT,
UNSPEC_UMULLT, -1))
FUNCTION (svnbsl, CODE_FOR_MODE0 (aarch64_sve2_nbsl),)
FUNCTION (svnmatch, svmatch_svnmatch_impl, (UNSPEC_NMATCH))
+FUNCTION (svorqv, reduction, (UNSPEC_ORQV, UNSPEC_ORQV, -1))
FUNCTION (svpext_lane, svpext_lane_impl,)
+FUNCTION (svpmov, svpmov_impl,)
+FUNCTION (svpmov_lane, svpmov_lane_impl,)
FUNCTION (svpmul, CODE_FOR_MODE0 (aarch64_sve2_pmul),)
FUNCTION (svpmullb, unspec_based_function, (-1, UNSPEC_PMULLB, -1))
FUNCTION (svpmullb_pair, unspec_based_function, (-1, UNSPEC_PMULLB_PAIR, -1))
@@ -787,6 +1083,12 @@ FUNCTION (svsm4ekey, fixed_insn_function, (CODE_FOR_aarch64_sve2_sm4ekey))
FUNCTION (svsqadd, svsqadd_impl,)
FUNCTION (svsra, svsra_impl,)
FUNCTION (svsri, unspec_based_function, (UNSPEC_SRI, UNSPEC_SRI, -1))
+FUNCTION (svst1dq, svst1xq_impl, (VNx1DImode))
+FUNCTION (svst1q_scatter, svst1q_scatter_impl,)
+FUNCTION (svst1wq, svst1xq_impl, (VNx1SImode))
+FUNCTION (svst2q, svst234q_impl, (2, VNx2TImode))
+FUNCTION (svst3q, svst234q_impl, (3, VNx3TImode))
+FUNCTION (svst4q, svst234q_impl, (4, VNx4TImode))
FUNCTION (svstnt1_scatter, svstnt1_scatter_impl,)
FUNCTION (svstnt1b_scatter, svstnt1_scatter_truncate_impl, (QImode))
FUNCTION (svstnt1h_scatter, svstnt1_scatter_truncate_impl, (HImode))
@@ -800,11 +1102,20 @@ FUNCTION (svsubltb, unspec_based_function, (UNSPEC_SSUBLTB, -1, -1))
FUNCTION (svsubwb, unspec_based_function, (UNSPEC_SSUBWB, UNSPEC_USUBWB, -1))
FUNCTION (svsubwt, unspec_based_function, (UNSPEC_SSUBWT, UNSPEC_USUBWT, -1))
FUNCTION (svtbl2, svtbl2_impl,)
-FUNCTION (svtbx, CODE_FOR_MODE0 (aarch64_sve2_tbx),)
+FUNCTION (svtblq, quiet<unspec_based_uncond_function>, (UNSPEC_TBLQ,
+ UNSPEC_TBLQ,
+ UNSPEC_TBLQ))
+FUNCTION (svtbx, quiet<unspec_based_uncond_function>, (UNSPEC_TBX, UNSPEC_TBX,
+ UNSPEC_TBX))
+FUNCTION (svtbxq, quiet<unspec_based_uncond_function>, (UNSPEC_TBXQ,
+ UNSPEC_TBXQ,
+ UNSPEC_TBXQ))
FUNCTION (svunpk, svunpk_impl,)
FUNCTION (svuqadd, svuqadd_impl,)
FUNCTION (svuzp, multireg_permute, (UNSPEC_UZP))
FUNCTION (svuzpq, multireg_permute, (UNSPEC_UZPQ))
+FUNCTION (svuzpq1, svuzpq_impl, (0))
+FUNCTION (svuzpq2, svuzpq_impl, (1))
FUNCTION (svwhilege, while_comparison, (UNSPEC_WHILEGE, UNSPEC_WHILEHS))
FUNCTION (svwhilegt, while_comparison, (UNSPEC_WHILEGT, UNSPEC_WHILEHI))
FUNCTION (svwhilerw, svwhilerw_svwhilewr_impl, (UNSPEC_WHILERW))
@@ -812,5 +1123,7 @@ FUNCTION (svwhilewr, svwhilerw_svwhilewr_impl, (UNSPEC_WHILEWR))
FUNCTION (svxar, svxar_impl,)
FUNCTION (svzip, multireg_permute, (UNSPEC_ZIP))
FUNCTION (svzipq, multireg_permute, (UNSPEC_ZIPQ))
+FUNCTION (svzipq1, svzipq_impl, (0))
+FUNCTION (svzipq2, svzipq_impl, (1))
} /* end namespace aarch64_sve */
@@ -220,6 +220,35 @@ DEF_SVE_FUNCTION (svsm4e, binary, s_unsigned, none)
DEF_SVE_FUNCTION (svsm4ekey, binary, s_unsigned, none)
#undef REQUIRED_EXTENSIONS
+#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2p1)
+DEF_SVE_FUNCTION (svaddqv, reduction_neonq, all_arith, implicit)
+DEF_SVE_FUNCTION (svandqv, reduction_neonq, all_integer, implicit)
+DEF_SVE_FUNCTION (svdup_laneq, unary_lane, all_data, none)
+DEF_SVE_FUNCTION (sveorqv, reduction_neonq, all_integer, implicit)
+DEF_SVE_FUNCTION (svextq, extq, all_data, none)
+DEF_SVE_FUNCTION (svld2q, load, all_data, implicit)
+DEF_SVE_FUNCTION (svld3q, load, all_data, implicit)
+DEF_SVE_FUNCTION (svld4q, load, all_data, implicit)
+DEF_SVE_FUNCTION (svmaxnmqv, reduction_neonq, all_float, implicit)
+DEF_SVE_FUNCTION (svmaxqv, reduction_neonq, all_arith, implicit)
+DEF_SVE_FUNCTION (svminnmqv, reduction_neonq, all_float, implicit)
+DEF_SVE_FUNCTION (svminqv, reduction_neonq, all_arith, implicit)
+DEF_SVE_FUNCTION (svpmov, pmov_from_vector, all_integer, none)
+DEF_SVE_FUNCTION (svpmov, inherent, all_integer, z)
+DEF_SVE_FUNCTION (svpmov_lane, pmov_from_vector_lane, all_integer, none)
+DEF_SVE_FUNCTION (svpmov_lane, pmov_to_vector_lane, hsd_integer, m)
+DEF_SVE_FUNCTION (svorqv, reduction_neonq, all_integer, implicit)
+DEF_SVE_FUNCTION (svst2q, store, all_data, implicit)
+DEF_SVE_FUNCTION (svst3q, store, all_data, implicit)
+DEF_SVE_FUNCTION (svst4q, store, all_data, implicit)
+DEF_SVE_FUNCTION (svtblq, binary_uint, all_data, none)
+DEF_SVE_FUNCTION (svtbxq, ternary_uint, all_data, none)
+DEF_SVE_FUNCTION (svuzpq1, binary, all_data, none)
+DEF_SVE_FUNCTION (svuzpq2, binary, all_data, none)
+DEF_SVE_FUNCTION (svzipq1, binary, all_data, none)
+DEF_SVE_FUNCTION (svzipq2, binary, all_data, none)
+#undef REQUIRED_EXTENSIONS
+
#define REQUIRED_EXTENSIONS sve_and_sme (AARCH64_FL_SVE2p1, 0)
DEF_SVE_FUNCTION (svclamp, clamp, all_integer, none)
DEF_SVE_FUNCTION (svpsel_lane, select_pred, all_pred_count, none)
@@ -254,6 +283,19 @@ DEF_SVE_FUNCTION_GS (svwhilelt, compare_scalar, while_x, x2, none)
DEF_SVE_FUNCTION (svwhilelt, compare_scalar_count, while_x_c, none)
#undef REQUIRED_EXTENSIONS
+#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2p1)
+DEF_SVE_FUNCTION (svld1q_gather, load_gather64_sv_offset, all_data, implicit)
+DEF_SVE_FUNCTION (svld1q_gather, load_gather64_sv_index, hsd_data, implicit)
+DEF_SVE_FUNCTION (svld1q_gather, load_gather64_vs_offset, all_data, implicit)
+DEF_SVE_FUNCTION (svld1q_gather, load_gather64_vs_index, hsd_data, implicit)
+DEF_SVE_FUNCTION (svld1udq, load, d_data, implicit)
+DEF_SVE_FUNCTION (svld1uwq, load, s_data, implicit)
+DEF_SVE_FUNCTION (svst1dq, store, d_data, implicit)
+DEF_SVE_FUNCTION (svst1q_scatter, store_scatter64_offset, all_data, implicit)
+DEF_SVE_FUNCTION (svst1q_scatter, store_scatter64_index, hsd_data, implicit)
+DEF_SVE_FUNCTION (svst1wq, store, s_data, implicit)
+#undef REQUIRED_EXTENSIONS
+
#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2)
DEF_SVE_FUNCTION_GS (svadd, binary_single, all_integer, x24, none)
DEF_SVE_FUNCTION_GS (svclamp, clamp, all_arith, x24, none)
@@ -38,12 +38,14 @@ namespace aarch64_sve
extern const function_base *const svaddlbt;
extern const function_base *const svaddlt;
extern const function_base *const svaddp;
+ extern const function_base *const svaddqv;
extern const function_base *const svaddwb;
extern const function_base *const svaddwt;
extern const function_base *const svaesd;
extern const function_base *const svaese;
extern const function_base *const svaesimc;
extern const function_base *const svaesmc;
+ extern const function_base *const svandqv;
extern const function_base *const svbcax;
extern const function_base *const svbdep;
extern const function_base *const svbext;
@@ -63,14 +65,23 @@ namespace aarch64_sve
extern const function_base *const svcvtn;
extern const function_base *const svcvtx;
extern const function_base *const svcvtxnt;
+ extern const function_base *const svdup_laneq;
extern const function_base *const sveor3;
extern const function_base *const sveorbt;
+ extern const function_base *const sveorqv;
extern const function_base *const sveortb;
+ extern const function_base *const svextq;
extern const function_base *const svhadd;
extern const function_base *const svhistcnt;
extern const function_base *const svhistseg;
extern const function_base *const svhsub;
extern const function_base *const svhsubr;
+ extern const function_base *const svld1q_gather;
+ extern const function_base *const svld1udq;
+ extern const function_base *const svld1uwq;
+ extern const function_base *const svld2q;
+ extern const function_base *const svld3q;
+ extern const function_base *const svld4q;
extern const function_base *const svldnt1_gather;
extern const function_base *const svldnt1sb_gather;
extern const function_base *const svldnt1sh_gather;
@@ -81,9 +92,13 @@ namespace aarch64_sve
extern const function_base *const svlogb;
extern const function_base *const svmatch;
extern const function_base *const svmaxnmp;
+ extern const function_base *const svmaxnmqv;
extern const function_base *const svmaxp;
+ extern const function_base *const svmaxqv;
extern const function_base *const svminnmp;
+ extern const function_base *const svminnmqv;
extern const function_base *const svminp;
+ extern const function_base *const svminqv;
extern const function_base *const svmlalb;
extern const function_base *const svmlalb_lane;
extern const function_base *const svmlalt;
@@ -100,7 +115,10 @@ namespace aarch64_sve
extern const function_base *const svmullt_lane;
extern const function_base *const svnbsl;
extern const function_base *const svnmatch;
+ extern const function_base *const svorqv;
extern const function_base *const svpext_lane;
+ extern const function_base *const svpmov;
+ extern const function_base *const svpmov_lane;
extern const function_base *const svpmul;
extern const function_base *const svpmullb;
extern const function_base *const svpmullb_pair;
@@ -180,6 +198,12 @@ namespace aarch64_sve
extern const function_base *const svsqadd;
extern const function_base *const svsra;
extern const function_base *const svsri;
+ extern const function_base *const svst1dq;
+ extern const function_base *const svst1q_scatter;
+ extern const function_base *const svst1wq;
+ extern const function_base *const svst2q;
+ extern const function_base *const svst3q;
+ extern const function_base *const svst4q;
extern const function_base *const svstnt1_scatter;
extern const function_base *const svstnt1b_scatter;
extern const function_base *const svstnt1h_scatter;
@@ -193,11 +217,15 @@ namespace aarch64_sve
extern const function_base *const svsubwb;
extern const function_base *const svsubwt;
extern const function_base *const svtbl2;
+ extern const function_base *const svtblq;
extern const function_base *const svtbx;
+ extern const function_base *const svtbxq;
extern const function_base *const svunpk;
extern const function_base *const svuqadd;
extern const function_base *const svuzp;
extern const function_base *const svuzpq;
+ extern const function_base *const svuzpq1;
+ extern const function_base *const svuzpq2;
extern const function_base *const svwhilege;
extern const function_base *const svwhilegt;
extern const function_base *const svwhilerw;
@@ -205,6 +233,8 @@ namespace aarch64_sve
extern const function_base *const svxar;
extern const function_base *const svzip;
extern const function_base *const svzipq;
+ extern const function_base *const svzipq1;
+ extern const function_base *const svzipq2;
}
}
@@ -334,6 +334,11 @@ CONSTEXPR const group_suffix_info group_suffixes[] = {
#define TYPES_hsd_integer(S, D) \
TYPES_hsd_signed (S, D), S (u16), S (u32), S (u64)
+#define TYPES_hsd_data(S, D) \
+ TYPES_h_data (S, D), \
+ TYPES_s_data (S, D), \
+ TYPES_d_data (S, D)
+
/* _f32. */
#define TYPES_s_float(S, D) \
S (f32)
@@ -742,12 +747,14 @@ DEF_SVE_TYPES_ARRAY (hs_data);
DEF_SVE_TYPES_ARRAY (hd_unsigned);
DEF_SVE_TYPES_ARRAY (hsd_signed);
DEF_SVE_TYPES_ARRAY (hsd_integer);
+DEF_SVE_TYPES_ARRAY (hsd_data);
DEF_SVE_TYPES_ARRAY (s_float);
DEF_SVE_TYPES_ARRAY (s_float_hsd_integer);
DEF_SVE_TYPES_ARRAY (s_float_sd_integer);
DEF_SVE_TYPES_ARRAY (s_signed);
DEF_SVE_TYPES_ARRAY (s_unsigned);
DEF_SVE_TYPES_ARRAY (s_integer);
+DEF_SVE_TYPES_ARRAY (s_data);
DEF_SVE_TYPES_ARRAY (sd_signed);
DEF_SVE_TYPES_ARRAY (sd_unsigned);
DEF_SVE_TYPES_ARRAY (sd_integer);
@@ -2036,6 +2043,15 @@ function_resolver::infer_pointer_type (unsigned int argno,
actual, argno + 1, fndecl);
return NUM_TYPE_SUFFIXES;
}
+ if (displacement_units () == UNITS_elements && bits == 8)
+ {
+ error_at (location, "passing %qT to argument %d of %qE, which"
+ " expects the data to be 16 bits or wider",
+ actual, argno + 1, fndecl);
+ inform (location, "use the %<offset%> rather than %<index%> form"
+ " for 8-bit data");
+ return NUM_TYPE_SUFFIXES;
+ }
return type;
}
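A hedged usage sketch of the new diagnostic (the intrinsic names are assumptions based on the shapes added earlier in the patch, not something this hunk defines): an index-form 128-bit gather of 8-bit data is rejected with the message above, and the offset form is the suggested replacement.

#include <arm_sve.h>

/* Assumed ACLE spelling; requires SVE2p1 in a non-streaming function.  */
svuint8_t
load_bytes (svbool_t pg, const uint8_t *base, svuint64_t byte_offsets)
{
  /* svld1q_gather_index (pg, base, byte_offsets) would hit the new error:
     "expects the data to be 16 bits or wider", with a note suggesting
     the offset form below.  */
  return svld1q_gather_offset (pg, base, byte_offsets);
}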
@@ -2827,7 +2843,8 @@ function_resolver::resolve_sv_displacement (unsigned int argno,
}
}
- if (type_suffix_ids[0] == NUM_TYPE_SUFFIXES)
+ if (type_suffix_ids[0] == NUM_TYPE_SUFFIXES
+ && shape->vector_base_type (TYPE_SUFFIX_u32) == TYPE_SUFFIX_u32)
{
/* TYPE has been inferred rather than specified by the user,
so mention it in the error messages. */
@@ -9018,6 +9018,7 @@ (define_insn "mask_fold_left_plus_<mode>"
;; -------------------------------------------------------------------------
;; Includes:
;; - TBL
+;; - TBLQ (SVE2p1)
;; -------------------------------------------------------------------------
(define_expand "vec_perm<mode>"
@@ -9033,14 +9034,14 @@ (define_expand "vec_perm<mode>"
}
)
-(define_insn "@aarch64_sve_tbl<mode>"
+(define_insn "@aarch64_sve_<perm_insn><mode>"
[(set (match_operand:SVE_FULL 0 "register_operand" "=w")
(unspec:SVE_FULL
[(match_operand:SVE_FULL 1 "register_operand" "w")
(match_operand:<V_INT_EQUIV> 2 "register_operand" "w")]
- UNSPEC_TBL))]
+ SVE_TBL))]
"TARGET_SVE"
- "tbl\t%0.<Vetype>, {%1.<Vetype>}, %2.<Vetype>"
+ "<perm_insn>\t%0.<Vetype>, {%1.<Vetype>}, %2.<Vetype>"
)
;; -------------------------------------------------------------------------
@@ -9129,9 +9130,13 @@ (define_insn "@aarch64_sve_rev<mode>"
;; - TRN1
;; - TRN2
;; - UZP1
+;; - UZPQ1 (SVE2p1)
;; - UZP2
+;; - UZPQ2 (SVE2p1)
;; - ZIP1
+;; - ZIPQ1 (SVE2p1)
;; - ZIP2
+;; - ZIPQ2 (SVE2p1)
;; -------------------------------------------------------------------------
;; Like EXT, but start at the first active element.
@@ -9156,7 +9161,7 @@ (define_insn "@aarch64_sve_<perm_insn><mode>"
(unspec:SVE_ALL
[(match_operand:SVE_ALL 1 "register_operand" "w")
(match_operand:SVE_ALL 2 "register_operand" "w")]
- PERMUTE))]
+ SVE_PERMUTE))]
"TARGET_SVE"
"<perm_insn>\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
)
@@ -21,12 +21,22 @@
;; The file is organised into the following sections (search for the full
;; line):
;;
+;; == Moves
+;; ---- Predicate to vector moves
+;; ---- Vector to predicate moves
+;;
;; == Loads
+;; ---- 128-bit extending loads
+;; ---- 128-bit structure loads
;; ---- Multi-register loads predicated by a counter
+;; ---- 128-bit gather loads
;; ---- Non-temporal gather loads
;;
;; == Stores
+;; ---- 128-bit truncating stores
+;; ---- 128-bit structure stores
;; ---- Multi-register stores predicated by a counter
+;; ---- 128-bit scatter stores
;; ---- Non-temporal scatter stores
;;
;; == Predicate manipulation
@@ -99,8 +109,13 @@
;; ---- [INT,FP] Select based on predicates as counters
;; ---- [INT] While tests
;;
+;; == Reductions
+;; ---- [INT] Reduction to 128-bit vector
+;; ---- [FP] Reduction to 128-bit vector
+;;
;; == Permutation
;; ---- [INT,FP] Reversal
+;; ---- [INT,FP] HVLA permutes
;; ---- [INT,FP] General permutes
;; ---- [INT,FP] Multi-register permutes
;; ---- [INT] Optional bit-permute extensions
@@ -115,10 +130,121 @@
;; ---- Optional SHA-3 extensions
;; ---- Optional SM4 extensions
+;; =========================================================================
+;; == Moves
+;; =========================================================================
+
+;; -------------------------------------------------------------------------
+;; ---- Predicate to vector moves
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - PMOV (to vector)
+;; -------------------------------------------------------------------------
+
+(define_insn "@aarch64_pmov_to_<mode>"
+ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w")
+ (unspec:SVE_FULL_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upa")]
+ UNSPEC_PMOV_UNPACK))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "pmov\t%0, %1.<Vetype>"
+)
+
+(define_insn "@aarch64_pmov_lane_to_<mode>"
+ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w")
+ (unspec:SVE_FULL_I
+ [(match_operand:SVE_FULL_I 1 "register_operand" "0")
+ (match_operand:<VPRED> 2 "register_operand" "Upa")
+ (match_operand:DI 3 "immediate_operand")]
+ UNSPEC_PMOV_UNPACK_LANE))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "pmov\t%0[%3], %2.<Vetype>"
+)
+
+;; -------------------------------------------------------------------------
+;; ---- Vector to predicate moves
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - PMOV (from vector)
+;; -------------------------------------------------------------------------
+
+(define_insn "@aarch64_pmov_from_<mode>"
+ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (unspec:VNx16BI
+ [(match_operand:SVE_FULL_I 1 "register_operand" "w")]
+ UNSPEC_PMOV_PACK))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "pmov\t%0.<Vetype>, %1"
+)
+
+(define_insn "@aarch64_pmov_lane_from_<mode>"
+ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (unspec:VNx16BI
+ [(match_operand:SVE_FULL_I 1 "register_operand" "w")
+ (match_operand:DI 2 "immediate_operand")]
+ UNSPEC_PMOV_PACK_LANE))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "pmov\t%0.<Vetype>, %1[%2]"
+)
+
;; =========================================================================
;; == Loads
;; =========================================================================
+;; -------------------------------------------------------------------------
+;; ---- 128-bit extending loads
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - LD1W (to .Q)
+;; - LD1D (to .Q)
+;; -------------------------------------------------------------------------
+
+;; There isn't really a natural way of representing these instructions
+;; with the modes that we normally use:
+;;
+;; (1) It doesn't really make sense to use VNx1TI (or similar) for the
+;; result, since there's nothing that can be done with such a mode
+;; other than to cast it to another mode. It also isn't how the
+;; ACLE represents it (for similar reasons).
+;;
+;; (2) Only the lowest bit of each 16 in the predicate is significant,
+;; but it doesn't really make sense to use VNx1BI to represent it,
+;; since there is no "PTRUE Pn.Q, ..." instruction.
+;;
+;; (3) We do however need to use VNx1DI and VNx1SI to represent the
+;; source memories, since none of the normal register modes would
+;; give the right extent and alignment information (with the alignment
+;; mattering only for -mstrict-align).
+(define_insn "@aarch64_sve_ld1_extendq<mode>"
+ [(set (match_operand:SVE_FULL_SD 0 "register_operand" "=w")
+ (unspec:SVE_FULL_SD
+ [(match_operand:<VPRED> 2 "register_operand" "Upl")
+ (match_operand:<LD1_EXTENDQ_MEM> 1 "memory_operand" "m")]
+ UNSPEC_LD1_EXTENDQ))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "ld1<Vesize>\t{%0.q}, %2/z, %1"
+)
+
+;; -------------------------------------------------------------------------
+;; ---- 128-bit structure loads
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - LD2Q
+;; - LD3Q
+;; - LD4Q
+;; -------------------------------------------------------------------------
+
+;; Predicated LD[234]Q.
+(define_insn "@aarch64_sve_ldnq<mode>"
+ [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w")
+ (unspec:SVE_STRUCT
+ [(match_operand:<VPRED> 2 "register_operand" "Upl")
+ (match_operand:<VNxTI> 1 "memory_operand" "m")]
+ UNSPEC_LDNQ))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "ld<vector_count>q\t{%S0.q - %<Vendreg>0.q}, %2/z, %1"
+)
+
;; -------------------------------------------------------------------------
;; ---- Multi-register loads predicated by a counter
;; -------------------------------------------------------------------------
@@ -195,6 +321,33 @@ (define_insn "@aarch64_<optab><mode>_strided4"
[(set_attr "stride_type" "ld1_strided")]
)
+;; -------------------------------------------------------------------------
+;; ---- 128-bit gather loads
+;; -------------------------------------------------------------------------
+;; Includes gather forms of:
+;; - LD1Q
+;; -------------------------------------------------------------------------
+
+;; Model this as operating on the largest valid element size, which is DI.
+;; This avoids having to define move patterns & more for VNx1TI, which would
+;; be difficult without a non-gather form of LD1Q.
+(define_insn "aarch64_gather_ld1q"
+ [(set (match_operand:VNx2DI 0 "register_operand")
+ (unspec:VNx2DI
+ [(match_operand:VNx2BI 1 "register_operand")
+ (match_operand:DI 2 "aarch64_reg_or_zero")
+ (match_operand:VNx2DI 3 "register_operand")
+ (mem:BLK (scratch))]
+ UNSPEC_LD1_GATHER))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ {@ [cons: =0, 1, 2, 3]
+ [&w, Upl, Z, w] ld1q\t{%0.q}, %1/z, [%3.d]
+ [?w, Upl, Z, 0] ^
+ [&w, Upl, r, w] ld1q\t{%0.q}, %1/z, [%3.d, %2]
+ [?w, Upl, r, 0] ^
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- Non-temporal gather loads
;; -------------------------------------------------------------------------
@@ -255,6 +408,48 @@ (define_insn_and_rewrite "@aarch64_gather_ldnt_<ANY_EXTEND:optab><SVE_FULL_SDI:m
;; == Stores
;; =========================================================================
+;; -------------------------------------------------------------------------
+;; ---- 128-bit truncating stores
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - ST1W (from .Q)
+;; - ST1D (from .Q)
+;; -------------------------------------------------------------------------
+
+;; See the comment above the corresponding loads for a discussion about the
+;; choice of modes.
+(define_insn "@aarch64_sve_st1_truncq<mode>"
+ [(set (match_operand:<LD1_EXTENDQ_MEM> 0 "memory_operand" "+m")
+ (unspec:<LD1_EXTENDQ_MEM>
+ [(match_operand:<VPRED> 2 "register_operand" "Upl")
+ (match_operand:SVE_FULL_SD 1 "register_operand" "w")
+ (match_dup 0)]
+ UNSPEC_ST1_TRUNCQ))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "st1<Vesize>\t{%1.q}, %2, %0"
+)
+
+;; -------------------------------------------------------------------------
+;; ---- 128-bit structure stores
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - ST2Q
+;; - ST3Q
+;; - ST4Q
+;; -------------------------------------------------------------------------
+
+;; Predicated ST[234]Q.
+(define_insn "@aarch64_sve_stnq<mode>"
+ [(set (match_operand:<VNxTI> 0 "memory_operand" "+m")
+ (unspec:<VNxTI>
+ [(match_operand:<VPRED> 2 "register_operand" "Upl")
+ (match_operand:SVE_STRUCT 1 "register_operand" "w")
+ (match_dup 0)]
+ UNSPEC_STNQ))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "st<vector_count>q\t{%S1.q - %<Vendreg>1.q}, %2, %0"
+)
+
;; -------------------------------------------------------------------------
;; ---- Multi-register stores predicated by a counter
;; -------------------------------------------------------------------------
@@ -311,6 +506,28 @@ (define_insn "@aarch64_<optab><mode>_strided4"
[(set_attr "stride_type" "st1_strided")]
)
+;; -------------------------------------------------------------------------
+;; ---- 128-bit scatter stores
+;; -------------------------------------------------------------------------
+;; Includes scatter form of:
+;; - ST1Q
+;; -------------------------------------------------------------------------
+
+(define_insn "aarch64_scatter_st1q"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:VNx2BI 0 "register_operand")
+ (match_operand:DI 1 "aarch64_reg_or_zero")
+ (match_operand:VNx2DI 2 "register_operand")
+ (match_operand:VNx2DI 3 "register_operand")]
+ UNSPEC_ST1Q_SCATTER))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ {@ [ cons: 0 , 1 , 2 , 3 ]
+ [ Upl , Z , w , w ] st1q\t{%3.q}, %0, [%2.d]
+ [ Upl , r , w , w ] st1q\t{%3.q}, %0, [%2.d, %1]
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- Non-temporal scatter stores
;; -------------------------------------------------------------------------
@@ -3171,6 +3388,55 @@ (define_insn "@aarch64_sve_while<while_optab_cmp>_c<BHSD_BITS>"
"while<cmp_op>\t%K0.<bits_etype>, %x1, %x2, vlx%3"
)
+;; =========================================================================
+;; == Reductions
+;; =========================================================================
+
+;; -------------------------------------------------------------------------
+;; ---- [INT] Reduction to 128-bit vector
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - ADDQV
+;; - ANDQV
+;; - EORQV
+;; - ORQV
+;; - SMAXQV
+;; - SMINQV
+;; - UMAXQV
+;; - UMINQV
+;; -------------------------------------------------------------------------
+
+(define_insn "@aarch64_pred_reduc_<optab>_<mode>"
+ [(set (match_operand:<V128> 0 "register_operand" "=w")
+ (unspec:<V128>
+ [(match_operand:<VPRED> 1 "register_operand" "Upl")
+ (match_operand:SVE_FULL_I 2 "register_operand" "w")]
+ SVE_INT_REDUCTION_128))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "<optab>\t%0.<Vtype>, %1, %2.<Vetype>"
+)
+
+;; -------------------------------------------------------------------------
+;; ---- [FP] Reduction to 128-bit vector
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - FADDQV
+;; - FMAXNMQV
+;; - FMAXQV
+;; - FMINNMQV
+;; - FMINQV
+;; -------------------------------------------------------------------------
+
+(define_insn "@aarch64_pred_reduc_<optab>_<mode>"
+ [(set (match_operand:<V128> 0 "register_operand" "=w")
+ (unspec:<V128>
+ [(match_operand:<VPRED> 1 "register_operand" "Upl")
+ (match_operand:SVE_FULL_F 2 "register_operand" "w")]
+ SVE_FP_REDUCTION_128))]
+ "TARGET_SVE2p1 && TARGET_NON_STREAMING"
+ "<optab>\t%0.<Vtype>, %1, %2.<Vetype>"
+)
+
;; =========================================================================
;; == Permutation
;; =========================================================================
@@ -3213,12 +3479,52 @@ (define_insn "@cond_<optab><mode>"
}
)
+;; -------------------------------------------------------------------------
+;; ---- [INT,FP] HVLA permutes
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - DUPQ
+;; - EXTQ
+;; -------------------------------------------------------------------------
+
+(define_insn "@aarch64_sve_dupq<mode>"
+ [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
+ (unspec:SVE_FULL
+ [(match_operand:SVE_FULL 1 "register_operand" "w")
+ (match_operand:SI 2 "const_int_operand")]
+ UNSPEC_DUPQ))]
+ "TARGET_SVE2p1
+ && TARGET_NON_STREAMING
+ && IN_RANGE (INTVAL (operands[2]) * (<elem_bits> / 8), 0, 15)"
+ "dupq\t%0.<Vetype>, %1.<Vetype>[%2]"
+)
+
+(define_insn "@aarch64_sve_extq<mode>"
+ [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_FULL
+ [(match_operand:SVE_FULL 1 "register_operand" "0, w")
+ (match_operand:SVE_FULL 2 "register_operand" "w, w")
+ (match_operand:SI 3 "const_int_operand")]
+ UNSPEC_EXTQ))]
+ "TARGET_SVE2p1
+ && TARGET_NON_STREAMING
+ && IN_RANGE (INTVAL (operands[3]) * (<elem_bits> / 8), 0, 15)"
+ {
+ operands[3] = GEN_INT (INTVAL (operands[3]) * (<elem_bits> / 8));
+ return (which_alternative == 0
+ ? "extq\\t%0.b, %0.b, %2.b, #%3"
+ : "movprfx\t%0, %1\;extq\\t%0.b, %0.b, %2.b, #%3");
+ }
+ [(set_attr "movprfx" "*,yes")]
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT,FP] General permutes
;; -------------------------------------------------------------------------
;; Includes:
;; - TBL (vector pair form)
;; - TBX
+;; - TBXQ (SVE2p1)
;; -------------------------------------------------------------------------
;; TBL on a pair of data vectors.
@@ -3232,16 +3538,16 @@ (define_insn "@aarch64_sve2_tbl2<mode>"
"tbl\t%0.<Vetype>, %1, %2.<Vetype>"
)
-;; TBX. These instructions do not take MOVPRFX.
-(define_insn "@aarch64_sve2_tbx<mode>"
+;; TBX(Q). These instructions do not take MOVPRFX.
+(define_insn "@aarch64_sve_<perm_insn><mode>"
[(set (match_operand:SVE_FULL 0 "register_operand" "=w")
(unspec:SVE_FULL
[(match_operand:SVE_FULL 1 "register_operand" "0")
(match_operand:SVE_FULL 2 "register_operand" "w")
(match_operand:<V_INT_EQUIV> 3 "register_operand" "w")]
- UNSPEC_TBX))]
+ SVE_TBX))]
"TARGET_SVE2"
- "tbx\t%0.<Vetype>, %2.<Vetype>, %3.<Vetype>"
+ "<perm_insn>\t%0.<Vetype>, %2.<Vetype>, %3.<Vetype>"
)
;; -------------------------------------------------------------------------
@@ -1692,6 +1692,32 @@ aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
}
}
+/* Like aarch64_classify_vector_mode, but also include modes that are used
+ for memory operands but not register operands. Such modes do not count
+ as real vector modes; they are just an internal construct to make things
+ easier to describe. */
+static unsigned int
+aarch64_classify_vector_memory_mode (machine_mode mode)
+{
+ switch (mode)
+ {
+ case VNx1SImode:
+ case VNx1DImode:
+ return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
+
+ case VNx1TImode:
+ return TARGET_SVE ? VEC_SVE_DATA : 0;
+
+ case VNx2TImode:
+ case VNx3TImode:
+ case VNx4TImode:
+ return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
+
+ default:
+ return aarch64_classify_vector_mode (mode);
+ }
+}
+
/* Return true if MODE is any of the Advanced SIMD structure modes. */
bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
@@ -2578,7 +2604,9 @@ aarch64_regmode_natural_size (machine_mode mode)
code for Advanced SIMD. */
if (!aarch64_sve_vg.is_constant ())
{
- unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ /* REGMODE_NATURAL_SIZE influences general subreg validity rules,
+ so we need to handle memory-only modes as well. */
+ unsigned int vec_flags = aarch64_classify_vector_memory_mode (mode);
if (vec_flags & VEC_SVE_PRED)
return BYTES_PER_SVE_PRED;
if (vec_flags & VEC_SVE_DATA)
@@ -10484,7 +10512,8 @@ aarch64_classify_index (struct aarch64_address_info *info, rtx x,
&& contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
index = SUBREG_REG (index);
- if (aarch64_sve_data_mode_p (mode) || mode == VNx1TImode)
+ auto vec_flags = aarch64_classify_vector_memory_mode (mode);
+ if (vec_flags & VEC_SVE_DATA)
{
if (type != ADDRESS_REG_REG
|| (1 << shift) != GET_MODE_UNIT_SIZE (mode))
@@ -10555,7 +10584,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
Partial vectors like VNx8QImode allow the same indexed addressing
mode and MUL VL addressing mode as full vectors like VNx16QImode;
in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
- unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ unsigned int vec_flags = aarch64_classify_vector_memory_mode (mode);
vec_flags &= ~VEC_PARTIAL;
/* On BE, we use load/store pair for all large int mode load/stores.
@@ -10591,8 +10620,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
&& ((vec_flags == 0
&& known_lt (GET_MODE_SIZE (mode), 16))
|| vec_flags == VEC_ADVSIMD
- || vec_flags & VEC_SVE_DATA
- || mode == VNx1TImode));
+ || vec_flags & VEC_SVE_DATA));
/* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
The latter is not valid for SVE predicates, and that's rejected through
@@ -10711,7 +10739,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
/* Make "m" use the LD1 offset range for SVE data modes, so
that pre-RTL optimizers like ivopts will work to that
instead of the wider LDR/STR range. */
- if (vec_flags == VEC_SVE_DATA || mode == VNx1TImode)
+ if (vec_flags == VEC_SVE_DATA)
return (type == ADDR_QUERY_M
? offset_4bit_signed_scaled_p (mode, offset)
: offset_9bit_signed_scaled_p (mode, offset));
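
For reference, the two predicates named here correspond to two immediate ranges, both scaled by the mode size (MUL VL for SVE modes). A hedged model, assuming the usual signed 4-bit LD1/ST1 range and signed 9-bit LDR/STR range rather than quoting the helpers' definitions:

#include <stdbool.h>
#include <stdint.h>

/* Assumed meaning of offset_4bit_signed_scaled_p (BITS == 4) and
   offset_9bit_signed_scaled_p (BITS == 9): OFFSET must be an exact multiple
   of MODE_SIZE and the multiple must fit in a signed BITS-bit immediate.  */
static bool
offset_signed_scaled_p (int64_t offset, int64_t mode_size, unsigned int bits)
{
  if (offset % mode_size != 0)
    return false;
  int64_t units = offset / mode_size;
  int64_t limit = (int64_t) 1 << (bits - 1);
  return units >= -limit && units < limit;
}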
@@ -12029,7 +12057,7 @@ sizetochar (int size)
case 64: return 'd';
case 32: return 's';
case 16: return 'h';
- case 8 : return 'b';
+ case 8: return 'b';
default: gcc_unreachable ();
}
}
@@ -12611,7 +12639,7 @@ aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
return true;
}
- vec_flags = aarch64_classify_vector_mode (mode);
+ vec_flags = aarch64_classify_vector_memory_mode (mode);
if ((vec_flags & VEC_ANY_SVE) && !load_store_pair_p)
{
HOST_WIDE_INT vnum
@@ -26238,6 +26266,107 @@ aarch64_evpc_dup (struct expand_vec_perm_d *d)
return true;
}
+/* Recognize things that can be done using the SVE2p1 Hybrid-VLA
+ permutations, which apply Advanced-SIMD-style permutations to each
+ individual 128-bit block. */
+
+static bool
+aarch64_evpc_hvla (struct expand_vec_perm_d *d)
+{
+ machine_mode vmode = d->vmode;
+ if (!TARGET_SVE2p1
+ || !TARGET_NON_STREAMING
+ || BYTES_BIG_ENDIAN
+ || d->vec_flags != VEC_SVE_DATA
+ || GET_MODE_UNIT_BITSIZE (vmode) > 64)
+ return false;
+
+ /* Set SUBELTS to the number of elements in an Advanced SIMD vector
+ and make sure that adding SUBELTS to each block of SUBELTS indices
+ gives the next block of SUBELTS indices. That is, it must be possible
+ to interpret the index vector as SUBELTS interleaved linear series in
+ which each series has step SUBELTS. */
+ unsigned int subelts = 128U / GET_MODE_UNIT_BITSIZE (vmode);
+ unsigned int pairs = subelts / 2;
+ for (unsigned int i = 0; i < subelts; ++i)
+ if (!d->perm.series_p (i, subelts, d->perm[i], subelts))
+ return false;
+
+ /* Used once we have verified that we can use UNSPEC to do the operation. */
+ auto use_binary = [&](int unspec) -> bool
+ {
+ if (!d->testing_p)
+ {
+ rtvec vec = gen_rtvec (2, d->op0, d->op1);
+ emit_set_insn (d->target, gen_rtx_UNSPEC (vmode, vec, unspec));
+ }
+ return true;
+ };
+
+ /* Now check whether the first SUBELTS elements match a supported
+ Advanced-SIMD-style operation. */
+ poly_int64 first = d->perm[0];
+ poly_int64 nelt = d->perm.length ();
+ auto try_zip = [&]() -> bool
+ {
+ if (maybe_ne (first, 0) && maybe_ne (first, pairs))
+ return false;
+ for (unsigned int i = 0; i < pairs; ++i)
+ if (maybe_ne (d->perm[i * 2], first + i)
+ || maybe_ne (d->perm[i * 2 + 1], first + nelt + i))
+ return false;
+ return use_binary (maybe_ne (first, 0) ? UNSPEC_ZIPQ2 : UNSPEC_ZIPQ1);
+ };
+ auto try_uzp = [&]() -> bool
+ {
+ if (maybe_ne (first, 0) && maybe_ne (first, 1))
+ return false;
+ for (unsigned int i = 0; i < pairs; ++i)
+ if (maybe_ne (d->perm[i], first + i * 2)
+ || maybe_ne (d->perm[i + pairs], first + nelt + i * 2))
+ return false;
+ return use_binary (maybe_ne (first, 0) ? UNSPEC_UZPQ2 : UNSPEC_UZPQ1);
+ };
+ auto try_extq = [&]() -> bool
+ {
+ HOST_WIDE_INT start;
+ if (!first.is_constant (&start) || !IN_RANGE (start, 0, subelts - 1))
+ return false;
+ for (unsigned int i = 0; i < subelts; ++i)
+ {
+ poly_int64 next = (start + i >= subelts
+ ? start + i - subelts + nelt
+ : start + i);
+ if (maybe_ne (d->perm[i], next))
+ return false;
+ }
+ if (!d->testing_p)
+ {
+ rtx op2 = gen_int_mode (start, SImode);
+ emit_insn (gen_aarch64_sve_extq (vmode, d->target,
+ d->op0, d->op1, op2));
+ }
+ return true;
+ };
+ auto try_dupq = [&]() -> bool
+ {
+ HOST_WIDE_INT start;
+ if (!first.is_constant (&start) || !IN_RANGE (start, 0, subelts - 1))
+ return false;
+ for (unsigned int i = 0; i < subelts; ++i)
+ if (maybe_ne (d->perm[i], start))
+ return false;
+ if (!d->testing_p)
+ {
+ rtx op1 = gen_int_mode (start, SImode);
+ emit_insn (gen_aarch64_sve_dupq (vmode, d->target, d->op0, op1));
+ }
+ return true;
+ };
+
+ return try_zip () || try_uzp () || try_extq () || try_dupq ();
+}
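
The structural requirement enforced by the series_p loop above can be restated simply: every block of SUBELTS indices must be the previous block shifted up by SUBELTS, so the permutation does the same thing in every 128-bit chunk. A standalone sketch of that check on a fixed-length selector (illustration only; the real code works on variable-length vec_perm_indices):

#include <stdbool.h>

/* Return true if PERM consists of SUBELTS interleaved linear series, each
   with step SUBELTS; equivalently, each block of SUBELTS indices equals the
   previous block plus SUBELTS.  */
static bool
hvla_shape_p (const int *perm, unsigned int nelts, unsigned int subelts)
{
  for (unsigned int i = 0; i + subelts < nelts; ++i)
    if (perm[i + subelts] != perm[i] + subelts)
      return false;
  return true;
}

For example, the ZIPQ1 selector 0, 8, 1, 9, 4, 12, 5, 13 used in the zipq tests below (four 32-bit elements per block) passes this check, whereas a selector that mixed elements across 128-bit blocks would not.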
+
static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
@@ -26514,6 +26643,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
else if (aarch64_evpc_ins (d))
return true;
+ else if (aarch64_evpc_hvla (d))
+ return true;
else if (aarch64_evpc_reencode (d))
return true;
@@ -734,7 +734,9 @@ (define_c_enum "unspec"
UNSPEC_USHLL ; Used in aarch64-simd.md.
UNSPEC_ADDP ; Used in aarch64-simd.md.
UNSPEC_TBL ; Used in vector permute patterns.
+ UNSPEC_TBLQ ; Used in vector permute patterns.
UNSPEC_TBX ; Used in vector permute patterns.
+ UNSPEC_TBXQ ; Used in vector permute patterns.
UNSPEC_CONCAT ; Used in vector permute patterns.
;; The following permute unspecs are generated directly by
@@ -1071,14 +1073,43 @@ (define_c_enum "unspec"
UNSPEC_FAMIN ; Used in aarch64-simd.md.
;; All used in aarch64-sve2.md
+ UNSPEC_ADDQV
+ UNSPEC_ANDQV
+ UNSPEC_DUPQ
+ UNSPEC_EORQV
+ UNSPEC_EXTQ
+ UNSPEC_FADDQV
+ UNSPEC_FMAXQV
+ UNSPEC_FMAXNMQV
+ UNSPEC_FMINQV
+ UNSPEC_FMINNMQV
UNSPEC_FCVTN
UNSPEC_FDOT
+ UNSPEC_LD1_EXTENDQ
+ UNSPEC_LD1Q_GATHER
+ UNSPEC_LDNQ
+ UNSPEC_ORQV
+ UNSPEC_PMOV_PACK
+ UNSPEC_PMOV_PACK_LANE
+ UNSPEC_PMOV_UNPACK
+ UNSPEC_PMOV_UNPACK_LANE
+ UNSPEC_SMAXQV
+ UNSPEC_SMINQV
UNSPEC_SQCVT
UNSPEC_SQCVTN
UNSPEC_SQCVTU
UNSPEC_SQCVTUN
+ UNSPEC_ST1_TRUNCQ
+ UNSPEC_ST1Q_SCATTER
+ UNSPEC_STNQ
+ UNSPEC_UMAXQV
+ UNSPEC_UMINQV
UNSPEC_UQCVT
UNSPEC_UQCVTN
+ UNSPEC_UZPQ1
+ UNSPEC_UZPQ2
+ UNSPEC_ZIPQ1
+ UNSPEC_ZIPQ2
;; All used in aarch64-sme.md
UNSPEC_SME_ADD
@@ -1326,7 +1357,11 @@ (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
(V4x16QI "16b") (V4x8HI "8h")
(V4x4SI "4s") (V4x2DI "2d")
(V4x8HF "8h") (V4x4SF "4s")
- (V4x2DF "2d") (V4x8BF "8h")])
+ (V4x2DF "2d") (V4x8BF "8h")
+ (VNx16QI "16b") (VNx8HI "8h")
+ (VNx4SI "4s") (VNx2DI "2d")
+ (VNx8HF "8h") (VNx4SF "4s")
+ (VNx2DF "2d") (VNx8BF "8h")])
;; Map mode to type used in widening multiplies.
(define_mode_attr Vcondtype [(V4HI "4h") (V8HI "4h") (V2SI "2s") (V4SI "2s")])
@@ -1994,7 +2029,22 @@ (define_mode_attr Vendreg [(OI "T") (CI "U") (XI "V")
(V4x4HF "V") (V4x8HF "V")
(V4x2SF "V") (V4x4SF "V")
(V4x1DF "V") (V4x2DF "V")
- (V4x4BF "V") (V4x8BF "V")])
+ (V4x4BF "V") (V4x8BF "V")
+
+ (VNx32QI "T") (VNx16HI "T")
+ (VNx8SI "T") (VNx4DI "T")
+ (VNx16BF "T") (VNx16HF "T")
+ (VNx8SF "T") (VNx4DF "T")
+
+ (VNx48QI "U") (VNx24HI "U")
+ (VNx12SI "U") (VNx6DI "U")
+ (VNx24BF "U") (VNx24HF "U")
+ (VNx12SF "U") (VNx6DF "U")
+
+ (VNx64QI "V") (VNx32HI "V")
+ (VNx16SI "V") (VNx8DI "V")
+ (VNx32BF "V") (VNx32HF "V")
+ (VNx16SF "V") (VNx8DF "V")])
;; This is both the number of Q-Registers needed to hold the corresponding
;; opaque large integer mode, and the number of elements touched by the
@@ -2338,6 +2388,21 @@ (define_mode_attr VDOUBLE [(VNx16QI "VNx32QI")
(VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
(VNx2DI "VNx4DI") (VNx2DF "VNx4DF")])
+(define_mode_attr VNxTI [(VNx32QI "VNx2TI") (VNx16HI "VNx2TI")
+ (VNx8SI "VNx2TI") (VNx4DI "VNx2TI")
+ (VNx16BF "VNx2TI") (VNx16HF "VNx2TI")
+ (VNx8SF "VNx2TI") (VNx4DF "VNx2TI")
+
+ (VNx48QI "VNx3TI") (VNx24HI "VNx3TI")
+ (VNx12SI "VNx3TI") (VNx6DI "VNx3TI")
+ (VNx24BF "VNx3TI") (VNx24HF "VNx3TI")
+ (VNx12SF "VNx3TI") (VNx6DF "VNx3TI")
+
+ (VNx64QI "VNx4TI") (VNx32HI "VNx4TI")
+ (VNx16SI "VNx4TI") (VNx8DI "VNx4TI")
+ (VNx32BF "VNx4TI") (VNx32HF "VNx4TI")
+ (VNx16SF "VNx4TI") (VNx8DF "VNx4TI")])
+
;; The Advanced SIMD modes of popcount corresponding to scalar modes.
(define_mode_attr VEC_POP_MODE [(QI "V8QI") (HI "V4HI")
(SI "V2SI") (DI "V1DI")])
@@ -2448,6 +2513,9 @@ (define_mode_attr aligned_fpr [(VNx16QI "w") (VNx8HI "w")
(VNx64QI "Uw4") (VNx32HI "Uw4")
(VNx32BF "Uw4") (VNx32HF "Uw4")])
+(define_mode_attr LD1_EXTENDQ_MEM [(VNx4SI "VNx1SI") (VNx4SF "VNx1SI")
+ (VNx2DI "VNx1DI") (VNx2DF "VNx1DI")])
+
;; -------------------------------------------------------------------
;; Code Iterators
;; -------------------------------------------------------------------
@@ -2973,6 +3041,21 @@ (define_int_iterator PERMUTE [UNSPEC_ZIP1 UNSPEC_ZIP2
UNSPEC_TRN1 UNSPEC_TRN2
UNSPEC_UZP1 UNSPEC_UZP2])
+(define_int_iterator SVE_PERMUTE
+ [PERMUTE
+ (UNSPEC_UZPQ1 "TARGET_SVE2p1 && TARGET_NON_STREAMING")
+ (UNSPEC_UZPQ2 "TARGET_SVE2p1 && TARGET_NON_STREAMING")
+ (UNSPEC_ZIPQ1 "TARGET_SVE2p1 && TARGET_NON_STREAMING")
+ (UNSPEC_ZIPQ2 "TARGET_SVE2p1 && TARGET_NON_STREAMING")])
+
+(define_int_iterator SVE_TBL
+ [UNSPEC_TBL
+ (UNSPEC_TBLQ "TARGET_SVE2p1 && TARGET_NON_STREAMING")])
+
+(define_int_iterator SVE_TBX
+ [UNSPEC_TBX
+ (UNSPEC_TBXQ "TARGET_SVE2p1 && TARGET_NON_STREAMING")])
+
(define_int_iterator PERMUTEQ [UNSPEC_ZIP1Q UNSPEC_ZIP2Q
UNSPEC_TRN1Q UNSPEC_TRN2Q
UNSPEC_UZP1Q UNSPEC_UZP2Q])
@@ -3072,12 +3155,27 @@ (define_int_iterator SVE_INT_REDUCTION [UNSPEC_ANDV
UNSPEC_UMINV
UNSPEC_XORV])
+(define_int_iterator SVE_INT_REDUCTION_128 [UNSPEC_ADDQV
+ UNSPEC_ANDQV
+ UNSPEC_EORQV
+ UNSPEC_ORQV
+ UNSPEC_SMAXQV
+ UNSPEC_SMINQV
+ UNSPEC_UMAXQV
+ UNSPEC_UMINQV])
+
(define_int_iterator SVE_FP_REDUCTION [UNSPEC_FADDV
UNSPEC_FMAXV
UNSPEC_FMAXNMV
UNSPEC_FMINV
UNSPEC_FMINNMV])
+(define_int_iterator SVE_FP_REDUCTION_128 [UNSPEC_FADDQV
+ UNSPEC_FMAXQV
+ UNSPEC_FMAXNMQV
+ UNSPEC_FMINQV
+ UNSPEC_FMINNMQV])
+
(define_int_iterator SVE_COND_FP_UNARY [UNSPEC_COND_FABS
UNSPEC_COND_FNEG
UNSPEC_COND_FRECPX
@@ -3629,6 +3727,8 @@ (define_int_attr optab [(UNSPEC_ANDF "and")
(UNSPEC_UMINV "umin")
(UNSPEC_SMAXV "smax")
(UNSPEC_SMINV "smin")
+ (UNSPEC_ADDQV "addqv")
+ (UNSPEC_ANDQV "andqv")
(UNSPEC_CADD90 "cadd90")
(UNSPEC_CADD270 "cadd270")
(UNSPEC_CDOT "cdot")
@@ -3639,9 +3739,15 @@ (define_int_attr optab [(UNSPEC_ANDF "and")
(UNSPEC_CMLA90 "cmla90")
(UNSPEC_CMLA180 "cmla180")
(UNSPEC_CMLA270 "cmla270")
+ (UNSPEC_EORQV "eorqv")
(UNSPEC_FADDV "plus")
+ (UNSPEC_FADDQV "faddqv")
+ (UNSPEC_FMAXQV "fmaxqv")
+ (UNSPEC_FMAXNMQV "fmaxnmqv")
(UNSPEC_FMAXNMV "smax")
(UNSPEC_FMAXV "smax_nan")
+ (UNSPEC_FMINQV "fminqv")
+ (UNSPEC_FMINNMQV "fminnmqv")
(UNSPEC_FMINNMV "smin")
(UNSPEC_FMINV "smin_nan")
(UNSPEC_SMUL_HIGHPART "smulh")
@@ -3657,11 +3763,16 @@ (define_int_attr optab [(UNSPEC_ANDF "and")
(UNSPEC_FTSSEL "ftssel")
(UNSPEC_LD1_COUNT "ld1")
(UNSPEC_LDNT1_COUNT "ldnt1")
+ (UNSPEC_ORQV "orqv")
(UNSPEC_PMULLB "pmullb")
(UNSPEC_PMULLB_PAIR "pmullb_pair")
(UNSPEC_PMULLT "pmullt")
(UNSPEC_PMULLT_PAIR "pmullt_pair")
(UNSPEC_SMATMUL "smatmul")
+ (UNSPEC_SMAXQV "smaxqv")
+ (UNSPEC_SMINQV "sminqv")
+ (UNSPEC_UMAXQV "umaxqv")
+ (UNSPEC_UMINQV "uminqv")
(UNSPEC_UZP "uzp")
(UNSPEC_UZPQ "uzpq")
(UNSPEC_ZIP "zip")
@@ -3955,12 +4066,16 @@ (define_int_attr pauth_hint_num [(UNSPEC_PACIASP "25")
(define_int_attr perm_insn [(UNSPEC_ZIP1 "zip1") (UNSPEC_ZIP2 "zip2")
(UNSPEC_ZIP1Q "zip1") (UNSPEC_ZIP2Q "zip2")
+ (UNSPEC_ZIPQ1 "zipq1") (UNSPEC_ZIPQ2 "zipq2")
(UNSPEC_TRN1 "trn1") (UNSPEC_TRN2 "trn2")
(UNSPEC_TRN1Q "trn1") (UNSPEC_TRN2Q "trn2")
(UNSPEC_UZP1 "uzp1") (UNSPEC_UZP2 "uzp2")
(UNSPEC_UZP1Q "uzp1") (UNSPEC_UZP2Q "uzp2")
+ (UNSPEC_UZPQ1 "uzpq1") (UNSPEC_UZPQ2 "uzpq2")
(UNSPEC_UZP "uzp") (UNSPEC_UZPQ "uzp")
- (UNSPEC_ZIP "zip") (UNSPEC_ZIPQ "zip")])
+ (UNSPEC_ZIP "zip") (UNSPEC_ZIPQ "zip")
+ (UNSPEC_TBL "tbl") (UNSPEC_TBLQ "tblq")
+ (UNSPEC_TBX "tbx") (UNSPEC_TBXQ "tbxq")])
; op code for REV instructions (size within which elements are reversed).
(define_int_attr rev_op [(UNSPEC_REV64 "64") (UNSPEC_REV32 "32")
new file mode 100644
@@ -0,0 +1,162 @@
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_sve.h>
+
+#pragma GCC target "+sve2p1"
+
+typedef svuint8_t fixed_uint8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svuint16_t fixed_uint16_t __attribute__((arm_sve_vector_bits(256)));
+typedef svint32_t fixed_int32_t __attribute__((arm_sve_vector_bits(256)));
+typedef svuint64_t fixed_uint64_t __attribute__((arm_sve_vector_bits(256)));
+
+/*
+** f1:
+** trn1 z0\.d, z0\.d, z0\.d
+** ret
+*/
+fixed_uint64_t
+f1 (fixed_uint64_t z0)
+{
+ return __builtin_shufflevector (z0, z0, 0, 0, 2, 2);
+}
+
+/*
+** f2:
+** trn2 z0\.d, z0\.d, z0\.d
+** ret
+*/
+fixed_uint64_t
+f2 (fixed_uint64_t z0)
+{
+ return __builtin_shufflevector (z0, z0, 1, 1, 3, 3);
+}
+
+/*
+** f3:
+** dupq z0\.s, z0\.s\[0\]
+** ret
+*/
+fixed_int32_t
+f3 (fixed_int32_t z0)
+{
+ return __builtin_shufflevector (z0, z0, 0, 0, 0, 0, 4, 4, 4, 4);
+}
+
+/*
+** f4:
+** dupq z0\.s, z0\.s\[1\]
+** ret
+*/
+fixed_int32_t
+f4 (fixed_int32_t z0)
+{
+ return __builtin_shufflevector (z0, z0, 1, 1, 1, 1, 5, 5, 5, 5);
+}
+
+/*
+** f5:
+** dupq z0\.s, z0\.s\[2\]
+** ret
+*/
+fixed_int32_t
+f5 (fixed_int32_t z0)
+{
+ return __builtin_shufflevector (z0, z0, 2, 2, 2, 2, 6, 6, 6, 6);
+}
+
+/*
+** f6:
+** dupq z0\.s, z0\.s\[3\]
+** ret
+*/
+fixed_int32_t
+f6 (fixed_int32_t z0)
+{
+ return __builtin_shufflevector (z0, z0, 3, 3, 3, 3, 7, 7, 7, 7);
+}
+
+/*
+** f7:
+** dupq z0\.h, z0\.h\[0\]
+** ret
+*/
+fixed_uint16_t
+f7 (fixed_uint16_t z0)
+{
+ return __builtin_shufflevector (z0, z0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 8, 8, 8, 8, 8, 8, 8, 8);
+}
+
+
+/*
+** f8:
+** dupq z0\.h, z0\.h\[5\]
+** ret
+*/
+fixed_uint16_t
+f8 (fixed_uint16_t z0)
+{
+ return __builtin_shufflevector (z0, z0,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 13, 13, 13, 13, 13, 13, 13, 13);
+}
+
+/*
+** f9:
+** dupq z0\.h, z0\.h\[7\]
+** ret
+*/
+fixed_uint16_t
+f9 (fixed_uint16_t z0)
+{
+ return __builtin_shufflevector (z0, z0,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 15, 15, 15, 15, 15, 15, 15, 15);
+}
+
+/*
+** f10:
+** dupq z0\.b, z0\.b\[0\]
+** ret
+*/
+fixed_uint8_t
+f10 (fixed_uint8_t z0)
+{
+ return __builtin_shufflevector (z0, z0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16);
+}
+
+/*
+** f11:
+** dupq z0\.b, z0\.b\[13\]
+** ret
+*/
+fixed_uint8_t
+f11 (fixed_uint8_t z0)
+{
+ return __builtin_shufflevector (z0, z0,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13,
+ 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29);
+}
+
+/*
+** f12:
+** dupq z0\.b, z0\.b\[15\]
+** ret
+*/
+fixed_uint8_t
+f12 (fixed_uint8_t z0)
+{
+ return __builtin_shufflevector (z0, z0,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31);
+}
new file mode 100644
@@ -0,0 +1,128 @@
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_sve.h>
+
+#pragma GCC target "+sve2p1"
+
+typedef svint8_t fixed_int8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svfloat16_t fixed_float16_t __attribute__((arm_sve_vector_bits(256)));
+typedef svuint32_t fixed_uint32_t __attribute__((arm_sve_vector_bits(256)));
+typedef svfloat64_t fixed_float64_t __attribute__((arm_sve_vector_bits(256)));
+
+/*
+** f1:
+** extq z0\.b, z0\.b, z1\.b, #8
+** ret
+*/
+fixed_float64_t
+f1 (fixed_float64_t z0, fixed_float64_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 1, 4, 3, 6);
+}
+
+/*
+** f2:
+** extq z0\.b, z0\.b, z1\.b, #4
+** ret
+*/
+fixed_uint32_t
+f2 (fixed_uint32_t z0, fixed_uint32_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 1, 2, 3, 8, 5, 6, 7, 12);
+}
+
+/*
+** f3:
+** extq z0\.b, z0\.b, z1\.b, #12
+** ret
+*/
+fixed_uint32_t
+f3 (fixed_uint32_t z0, fixed_uint32_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 3, 8, 9, 10, 7, 12, 13, 14);
+}
+
+/*
+** f4:
+** extq z0\.b, z0\.b, z1\.b, #2
+** ret
+*/
+fixed_float16_t
+f4 (fixed_float16_t z0, fixed_float16_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 1, 2, 3, 4, 5, 6, 7, 16,
+ 9, 10, 11, 12, 13, 14, 15, 24);
+}
+
+/*
+** f5:
+** extq z0\.b, z0\.b, z1\.b, #10
+** ret
+*/
+fixed_float16_t
+f5 (fixed_float16_t z0, fixed_float16_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 5, 6, 7, 16, 17, 18, 19, 20,
+ 13, 14, 15, 24, 25, 26, 27, 28);
+}
+
+/*
+** f6:
+** extq z0\.b, z0\.b, z1\.b, #14
+** ret
+*/
+fixed_float16_t
+f6 (fixed_float16_t z0, fixed_float16_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 7, 16, 17, 18, 19, 20, 21, 22,
+ 15, 24, 25, 26, 27, 28, 29, 30);
+}
+
+/*
+** f7:
+** extq z0\.b, z0\.b, z1\.b, #1
+** ret
+*/
+fixed_int8_t
+f7 (fixed_int8_t z0, fixed_int8_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 32,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 48);
+}
+
+/*
+** f8:
+** extq z0\.b, z0\.b, z1\.b, #11
+** ret
+*/
+fixed_int8_t
+f8 (fixed_int8_t z0, fixed_int8_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 11, 12, 13, 14, 15, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42,
+ 27, 28, 29, 30, 31, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58);
+}
+
+/*
+** f9:
+** extq z0\.b, z0\.b, z1\.b, #15
+** ret
+*/
+fixed_int8_t
+f9 (fixed_int8_t z0, fixed_int8_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 15, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 31, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62);
+}
new file mode 100644
@@ -0,0 +1,111 @@
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_sve.h>
+
+#pragma GCC target "+sve2p1"
+
+typedef svuint8_t fixed_uint8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svbfloat16_t fixed_bfloat16_t __attribute__((arm_sve_vector_bits(256)));
+typedef svfloat32_t fixed_float32_t __attribute__((arm_sve_vector_bits(256)));
+typedef svint64_t fixed_int64_t __attribute__((arm_sve_vector_bits(256)));
+
+/*
+** f1:
+** trn1 z0\.d, z0\.d, z1\.d
+** ret
+*/
+fixed_int64_t
+f1 (fixed_int64_t z0, fixed_int64_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 0, 4, 2, 6);
+}
+
+/*
+** f2:
+** trn2 z0\.d, z0\.d, z1\.d
+** ret
+*/
+fixed_int64_t
+f2 (fixed_int64_t z0, fixed_int64_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 1, 5, 3, 7);
+}
+
+/*
+** f3:
+** uzpq1 z0\.s, z0\.s, z1\.s
+** ret
+*/
+fixed_float32_t
+f3 (fixed_float32_t z0, fixed_float32_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 0, 2, 8, 10, 4, 6, 12, 14);
+}
+
+/*
+** f4:
+** uzpq2 z0\.s, z0\.s, z1\.s
+** ret
+*/
+fixed_float32_t
+f4 (fixed_float32_t z0, fixed_float32_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 1, 3, 9, 11, 5, 7, 13, 15);
+}
+
+/*
+** f5:
+** uzpq1 z0\.h, z0\.h, z1\.h
+** ret
+*/
+fixed_bfloat16_t
+f5 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 0, 2, 4, 6, 16, 18, 20, 22,
+ 8, 10, 12, 14, 24, 26, 28, 30);
+}
+
+/*
+** f6:
+** uzpq2 z0\.h, z0\.h, z1\.h
+** ret
+*/
+fixed_bfloat16_t
+f6 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 1, 3, 5, 7, 17, 19, 21, 23,
+ 9, 11, 13, 15, 25, 27, 29, 31);
+}
+
+/*
+** f7:
+** uzpq1 z0\.b, z0\.b, z1\.b
+** ret
+*/
+fixed_uint8_t
+f7 (fixed_uint8_t z0, fixed_uint8_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 48, 50, 52, 54, 56, 58, 60, 62);
+}
+
+/*
+** f8:
+** uzpq2 z0\.b, z0\.b, z1\.b
+** ret
+*/
+fixed_uint8_t
+f8 (fixed_uint8_t z0, fixed_uint8_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 1, 3, 5, 7, 9, 11, 13, 15,
+ 33, 35, 37, 39, 41, 43, 45, 47,
+ 17, 19, 21, 23, 25, 27, 29, 31,
+ 49, 51, 53, 55, 57, 59, 61, 63);
+}
new file mode 100644
@@ -0,0 +1,111 @@
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_sve.h>
+
+#pragma GCC target "+sve2p1"
+
+typedef svuint8_t fixed_uint8_t __attribute__((arm_sve_vector_bits(256)));
+typedef svbfloat16_t fixed_bfloat16_t __attribute__((arm_sve_vector_bits(256)));
+typedef svfloat32_t fixed_float32_t __attribute__((arm_sve_vector_bits(256)));
+typedef svint64_t fixed_int64_t __attribute__((arm_sve_vector_bits(256)));
+
+/*
+** f1:
+** trn1 z0\.d, z0\.d, z1\.d
+** ret
+*/
+fixed_int64_t
+f1 (fixed_int64_t z0, fixed_int64_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 0, 4, 2, 6);
+}
+
+/*
+** f2:
+** trn2 z0\.d, z0\.d, z1\.d
+** ret
+*/
+fixed_int64_t
+f2 (fixed_int64_t z0, fixed_int64_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 1, 5, 3, 7);
+}
+
+/*
+** f3:
+** zipq1 z0\.s, z0\.s, z1\.s
+** ret
+*/
+fixed_float32_t
+f3 (fixed_float32_t z0, fixed_float32_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 0, 8, 1, 9, 4, 12, 5, 13);
+}
+
+/*
+** f4:
+** zipq2 z0\.s, z0\.s, z1\.s
+** ret
+*/
+fixed_float32_t
+f4 (fixed_float32_t z0, fixed_float32_t z1)
+{
+ return __builtin_shufflevector (z0, z1, 2, 10, 3, 11, 6, 14, 7, 15);
+}
+
+/*
+** f5:
+** zipq1 z0\.h, z0\.h, z1\.h
+** ret
+*/
+fixed_bfloat16_t
+f5 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ 8, 24, 9, 25, 10, 26, 11, 27);
+}
+
+/*
+** f6:
+** zipq2 z0\.h, z0\.h, z1\.h
+** ret
+*/
+fixed_bfloat16_t
+f6 (fixed_bfloat16_t z0, fixed_bfloat16_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 4, 20, 5, 21, 6, 22, 7, 23,
+ 12, 28, 13, 29, 14, 30, 15, 31);
+}
+
+/*
+** f7:
+** zipq1 z0\.b, z0\.b, z1\.b
+** ret
+*/
+fixed_uint8_t
+f7 (fixed_uint8_t z0, fixed_uint8_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 0, 32, 1, 33, 2, 34, 3, 35,
+ 4, 36, 5, 37, 6, 38, 7, 39,
+ 16, 48, 17, 49, 18, 50, 19, 51,
+ 20, 52, 21, 53, 22, 54, 23, 55);
+}
+
+/*
+** f8:
+** zipq2 z0\.b, z0\.b, z1\.b
+** ret
+*/
+fixed_uint8_t
+f8 (fixed_uint8_t z0, fixed_uint8_t z1)
+{
+ return __builtin_shufflevector (z0, z1,
+ 8, 40, 9, 41, 10, 42, 11, 43,
+ 12, 44, 13, 45, 14, 46, 15, 47,
+ 24, 56, 25, 57, 26, 58, 27, 59,
+ 28, 60, 29, 61, 30, 62, 31, 63);
+}