@@ -1100,6 +1100,73 @@ public:
}
};
+
+/* Implements vst2q and vst4q.  */
+class vst24_impl : public full_width_access
+{
+public:
+ using full_width_access::full_width_access;
+
+ unsigned int
+ call_properties (const function_instance &) const override
+ {
+ return CP_WRITE_MEMORY;
+ }
+
+ rtx
+ expand (function_expander &e) const override
+ {
+ insn_code icode;
+ switch (vectors_per_tuple ())
+ {
+ case 2:
+ icode = code_for_mve_vst2q (e.vector_mode (0));
+ break;
+
+ case 4:
+ icode = code_for_mve_vst4q (e.vector_mode (0));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ return e.use_contiguous_store_insn (icode);
+ }
+};
+
+/* Implements vld2q and vld4q.  */
+class vld24_impl : public full_width_access
+{
+public:
+ using full_width_access::full_width_access;
+
+ unsigned int
+ call_properties (const function_instance &) const override
+ {
+ return CP_READ_MEMORY;
+ }
+
+ rtx
+ expand (function_expander &e) const override
+ {
+ insn_code icode;
+ switch (vectors_per_tuple ())
+ {
+ case 2:
+ icode = code_for_mve_vld2q (e.vector_mode (0));
+ break;
+
+ case 4:
+ icode = code_for_mve_vld4q (e.vector_mode (0));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ return e.use_contiguous_load_insn (icode);
+ }
+};
+
} /* end anonymous namespace */
namespace arm_mve {
@@ -1326,6 +1393,8 @@ FUNCTION (vfmsq, unspec_mve_function_exact_insn, (-1, -1, VFMSQ_F, -1, -1, -1, -
FUNCTION_WITH_M_N_NO_F (vhaddq, VHADDQ)
FUNCTION_WITH_M_N_NO_F (vhsubq, VHSUBQ)
FUNCTION (vld1q, vld1_impl,)
+FUNCTION (vld2q, vld24_impl, (2))
+FUNCTION (vld4q, vld24_impl, (4))
FUNCTION (vldrbq, vldrq_impl, (TYPE_SUFFIX_s8, TYPE_SUFFIX_u8))
FUNCTION (vldrbq_gather, vldrq_gather_impl, (false, TYPE_SUFFIX_s8, TYPE_SUFFIX_u8))
FUNCTION (vldrdq_gather, vldrq_gather_impl, (false, TYPE_SUFFIX_s64, TYPE_SUFFIX_u64, NUM_TYPE_SUFFIXES))
@@ -1458,6 +1527,8 @@ FUNCTION_ONLY_N_NO_F (vshrq, VSHRQ)
FUNCTION_ONLY_N_NO_F (vsliq, VSLIQ)
FUNCTION_ONLY_N_NO_F (vsriq, VSRIQ)
FUNCTION (vst1q, vst1_impl,)
+FUNCTION (vst2q, vst24_impl, (2))
+FUNCTION (vst4q, vst24_impl, (4))
FUNCTION (vstrbq, vstrq_impl, (QImode, opt_scalar_mode ()))
FUNCTION (vstrbq_scatter, vstrq_scatter_impl, (false, QImode, opt_scalar_mode ()))
FUNCTION (vstrdq_scatter, vstrq_scatter_impl, (false, DImode, opt_scalar_mode ()))
@@ -59,6 +59,8 @@ DEF_MVE_FUNCTION (vhsubq, binary_opt_n, all_integer, mx_or_none)
DEF_MVE_FUNCTION (vidupq, viddup, all_unsigned, mx_or_none)
DEF_MVE_FUNCTION (viwdupq, vidwdup, all_unsigned, mx_or_none)
DEF_MVE_FUNCTION (vld1q, load, all_integer, z_or_none)
+DEF_MVE_FUNCTION (vld2q, load, all_integer, none)
+DEF_MVE_FUNCTION (vld4q, load, all_integer, none)
DEF_MVE_FUNCTION (vldrbq, load_ext, all_integer, z_or_none)
DEF_MVE_FUNCTION (vldrbq_gather, load_ext_gather_offset, all_integer, z_or_none)
DEF_MVE_FUNCTION (vldrdq_gather, load_ext_gather_offset, integer_64, z_or_none)
@@ -179,6 +181,8 @@ DEF_MVE_FUNCTION (vshrq, binary_rshift, all_integer, mx_or_none)
DEF_MVE_FUNCTION (vsliq, ternary_lshift, all_integer, m_or_none)
DEF_MVE_FUNCTION (vsriq, ternary_rshift, all_integer, m_or_none)
DEF_MVE_FUNCTION (vst1q, store, all_integer, p_or_none)
+DEF_MVE_FUNCTION (vst2q, store, all_integer, none)
+DEF_MVE_FUNCTION (vst4q, store, all_integer, none)
DEF_MVE_FUNCTION (vstrbq, store, all_integer, p_or_none)
DEF_MVE_FUNCTION (vstrbq_scatter, store_scatter_offset, all_integer, p_or_none)
DEF_MVE_FUNCTION (vstrhq, store, integer_16_32, p_or_none)
@@ -234,6 +238,8 @@ DEF_MVE_FUNCTION (vfmaq, ternary_opt_n, all_float, m_or_none)
DEF_MVE_FUNCTION (vfmasq, ternary_n, all_float, m_or_none)
DEF_MVE_FUNCTION (vfmsq, ternary, all_float, m_or_none)
DEF_MVE_FUNCTION (vld1q, load, all_float, z_or_none)
+DEF_MVE_FUNCTION (vld2q, load, all_float, none)
+DEF_MVE_FUNCTION (vld4q, load, all_float, none)
DEF_MVE_FUNCTION (vldrhq, load_ext, float_16, z_or_none)
DEF_MVE_FUNCTION (vldrhq_gather, load_ext_gather_offset, float_16, z_or_none)
DEF_MVE_FUNCTION (vldrhq_gather_shifted, load_ext_gather_offset, float_16, z_or_none)
@@ -264,6 +270,8 @@ DEF_MVE_FUNCTION (vrndpq, unary, all_float, mx_or_none)
DEF_MVE_FUNCTION (vrndq, unary, all_float, mx_or_none)
DEF_MVE_FUNCTION (vrndxq, unary, all_float, mx_or_none)
DEF_MVE_FUNCTION (vst1q, store, all_float, p_or_none)
+DEF_MVE_FUNCTION (vst2q, store, all_float, none)
+DEF_MVE_FUNCTION (vst4q, store, all_float, none)
DEF_MVE_FUNCTION (vstrhq, store, float_16, p_or_none)
DEF_MVE_FUNCTION (vstrhq_scatter, store_scatter_offset, float_16, p_or_none)
DEF_MVE_FUNCTION (vstrhq_scatter_shifted, store_scatter_offset, float_16, p_or_none)
@@ -82,6 +82,8 @@ extern const function_base *const vhsubq;
extern const function_base *const vidupq;
extern const function_base *const viwdupq;
extern const function_base *const vld1q;
+extern const function_base *const vld2q;
+extern const function_base *const vld4q;
extern const function_base *const vldrbq;
extern const function_base *const vldrbq_gather;
extern const function_base *const vldrdq_gather;
@@ -214,6 +216,8 @@ extern const function_base *const vshrq;
extern const function_base *const vsliq;
extern const function_base *const vsriq;
extern const function_base *const vst1q;
+extern const function_base *const vst2q;
+extern const function_base *const vst4q;
extern const function_base *const vstrbq;
extern const function_base *const vstrbq_scatter;
extern const function_base *const vstrdq_scatter;
@@ -535,11 +535,13 @@ register_builtin_tuple_types (vector_type_index type)
tree vectype = acle_vector_types[0][type];
tree arrtype = build_array_type_nelts (vectype, num_vectors);
- gcc_assert (TYPE_MODE_RAW (arrtype) == TYPE_MODE (arrtype)
+ gcc_assert (VECTOR_MODE_P (TYPE_MODE (arrtype))
+ && TYPE_MODE_RAW (arrtype) == TYPE_MODE (arrtype)
&& TYPE_ALIGN (arrtype) == 64);
tree tuple_type = wrap_type_in_struct (arrtype);
- gcc_assert (TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type)
+ gcc_assert (VECTOR_MODE_P (TYPE_MODE (tuple_type))
+ && TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type)
&& TYPE_ALIGN (tuple_type) == 64);
register_type_decl (tuple_type, buffer);
@@ -278,6 +278,7 @@ static rtx_insn *arm_pic_static_addr (rtx orig, rtx reg);
static bool cortex_a9_sched_adjust_cost (rtx_insn *, int, rtx_insn *, int *);
static bool xscale_sched_adjust_cost (rtx_insn *, int, rtx_insn *, int *);
static bool fa726te_sched_adjust_cost (rtx_insn *, int, rtx_insn *, int *);
+static opt_machine_mode arm_array_mode (machine_mode, unsigned HOST_WIDE_INT);
static bool arm_array_mode_supported_p (machine_mode,
unsigned HOST_WIDE_INT);
static machine_mode arm_preferred_simd_mode (scalar_mode);
@@ -515,6 +516,8 @@ static const scoped_attribute_specs *const arm_attribute_table[] =
#define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p
+#undef TARGET_ARRAY_MODE
+#define TARGET_ARRAY_MODE arm_array_mode
#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P arm_array_mode_supported_p
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
@@ -20774,7 +20777,9 @@ output_move_neon (rtx *operands)
|| NEON_REGNO_OK_FOR_QUAD (regno));
gcc_assert (VALID_NEON_DREG_MODE (mode)
|| VALID_NEON_QREG_MODE (mode)
- || VALID_NEON_STRUCT_MODE (mode));
+ || VALID_NEON_STRUCT_MODE (mode)
+ || (TARGET_HAVE_MVE
+ && VALID_MVE_STRUCT_MODE (mode)));
gcc_assert (MEM_P (mem));
addr = XEXP (mem, 0);
@@ -24949,7 +24954,8 @@ arm_print_operand_address (FILE *stream, machine_mode mode, rtx x)
REGNO (XEXP (x, 0)),
GET_CODE (x) == PRE_DEC ? "-" : "",
GET_MODE_SIZE (mode));
- else if (TARGET_HAVE_MVE && (mode == OImode || mode == XImode))
+ else if (TARGET_HAVE_MVE
+ && VALID_MVE_STRUCT_MODE (mode))
asm_fprintf (stream, "[%r]!", REGNO (XEXP (x,0)));
else
asm_fprintf (stream, "[%r], #%s%d", REGNO (XEXP (x, 0)),
@@ -25839,7 +25845,17 @@ arm_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
if (TARGET_HAVE_MVE)
return ((VALID_MVE_MODE (mode) && NEON_REGNO_OK_FOR_QUAD (regno))
|| (mode == OImode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
- || (mode == XImode && NEON_REGNO_OK_FOR_NREGS (regno, 8)));
+ || (mode == V2x16QImode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+ || (mode == V2x8HImode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+ || (mode == V2x4SImode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+ || (mode == V2x8HFmode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+ || (mode == V2x4SFmode && NEON_REGNO_OK_FOR_NREGS (regno, 4))
+ || (mode == XImode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+ || (mode == V4x16QImode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+ || (mode == V4x8HImode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+ || (mode == V4x4SImode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+ || (mode == V4x8HFmode && NEON_REGNO_OK_FOR_NREGS (regno, 8))
+ || (mode == V4x4SFmode && NEON_REGNO_OK_FOR_NREGS (regno, 8)));
return false;
}
@@ -29785,6 +29801,27 @@ arm_vector_mode_supported_p (machine_mode mode)
return false;
}
+/* Implements target hook array_mode. */
+static opt_machine_mode
+arm_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
+{
+ if (TARGET_HAVE_MVE
+ /* MVE accepts only tuples of 2 or 4 vectors. */
+ && (nelems == 2
+ || nelems == 4))
+ {
+ machine_mode struct_mode;
+ FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
+ {
+ if (GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
+ && known_eq (GET_MODE_NUNITS (struct_mode),
+ GET_MODE_NUNITS (mode) * nelems))
+ return struct_mode;
+ }
+ }
+ return opt_machine_mode ();
+}
+
/* Implements target hook array_mode_supported_p. */
static bool
@@ -1127,8 +1127,17 @@ extern const int arm_arch_cde_coproc_bits[];
((MODE) == TImode || (MODE) == EImode || (MODE) == OImode \
|| (MODE) == CImode || (MODE) == XImode)
-#define VALID_MVE_STRUCT_MODE(MODE) \
- ((MODE) == TImode || (MODE) == OImode || (MODE) == XImode)
+#define VALID_MVE_STRUCT_MODE(MODE) \
+ ((MODE) == V2x16QImode \
+ || (MODE) == V2x8HImode \
+ || (MODE) == V2x4SImode \
+ || (MODE) == V2x8HFmode \
+ || (MODE) == V2x4SFmode \
+ || (MODE) == V4x16QImode \
+ || (MODE) == V4x8HImode \
+ || (MODE) == V4x4SImode \
+ || (MODE) == V4x8HFmode \
+ || (MODE) == V4x4SFmode)
/* The conditions under which vector modes are supported for general
arithmetic using Neon. */
@@ -45,23 +45,11 @@
#endif
#ifndef __ARM_MVE_PRESERVE_USER_NAMESPACE
-#define vst4q(__addr, __value) __arm_vst4q(__addr, __value)
#define vuninitializedq(__v) __arm_vuninitializedq(__v)
-#define vst2q(__addr, __value) __arm_vst2q(__addr, __value)
-#define vld2q(__addr) __arm_vld2q(__addr)
-#define vld4q(__addr) __arm_vld4q(__addr)
#define vsetq_lane(__a, __b, __idx) __arm_vsetq_lane(__a, __b, __idx)
#define vgetq_lane(__a, __idx) __arm_vgetq_lane(__a, __idx)
-#define vst4q_s8( __addr, __value) __arm_vst4q_s8( __addr, __value)
-#define vst4q_s16( __addr, __value) __arm_vst4q_s16( __addr, __value)
-#define vst4q_s32( __addr, __value) __arm_vst4q_s32( __addr, __value)
-#define vst4q_u8( __addr, __value) __arm_vst4q_u8( __addr, __value)
-#define vst4q_u16( __addr, __value) __arm_vst4q_u16( __addr, __value)
-#define vst4q_u32( __addr, __value) __arm_vst4q_u32( __addr, __value)
-#define vst4q_f16( __addr, __value) __arm_vst4q_f16( __addr, __value)
-#define vst4q_f32( __addr, __value) __arm_vst4q_f32( __addr, __value)
#define vpnot(__a) __arm_vpnot(__a)
#define vuninitializedq_u8(void) __arm_vuninitializedq_u8(void)
#define vuninitializedq_u16(void) __arm_vuninitializedq_u16(void)
@@ -73,30 +61,6 @@
#define vuninitializedq_s64(void) __arm_vuninitializedq_s64(void)
#define vuninitializedq_f16(void) __arm_vuninitializedq_f16(void)
#define vuninitializedq_f32(void) __arm_vuninitializedq_f32(void)
-#define vst2q_s8(__addr, __value) __arm_vst2q_s8(__addr, __value)
-#define vst2q_u8(__addr, __value) __arm_vst2q_u8(__addr, __value)
-#define vld2q_s8(__addr) __arm_vld2q_s8(__addr)
-#define vld2q_u8(__addr) __arm_vld2q_u8(__addr)
-#define vld4q_s8(__addr) __arm_vld4q_s8(__addr)
-#define vld4q_u8(__addr) __arm_vld4q_u8(__addr)
-#define vst2q_s16(__addr, __value) __arm_vst2q_s16(__addr, __value)
-#define vst2q_u16(__addr, __value) __arm_vst2q_u16(__addr, __value)
-#define vld2q_s16(__addr) __arm_vld2q_s16(__addr)
-#define vld2q_u16(__addr) __arm_vld2q_u16(__addr)
-#define vld4q_s16(__addr) __arm_vld4q_s16(__addr)
-#define vld4q_u16(__addr) __arm_vld4q_u16(__addr)
-#define vst2q_s32(__addr, __value) __arm_vst2q_s32(__addr, __value)
-#define vst2q_u32(__addr, __value) __arm_vst2q_u32(__addr, __value)
-#define vld2q_s32(__addr) __arm_vld2q_s32(__addr)
-#define vld2q_u32(__addr) __arm_vld2q_u32(__addr)
-#define vld4q_s32(__addr) __arm_vld4q_s32(__addr)
-#define vld4q_u32(__addr) __arm_vld4q_u32(__addr)
-#define vld4q_f16(__addr) __arm_vld4q_f16(__addr)
-#define vld2q_f16(__addr) __arm_vld2q_f16(__addr)
-#define vst2q_f16(__addr, __value) __arm_vst2q_f16(__addr, __value)
-#define vld4q_f32(__addr) __arm_vld4q_f32(__addr)
-#define vld2q_f32(__addr) __arm_vld2q_f32(__addr)
-#define vst2q_f32(__addr, __value) __arm_vst2q_f32(__addr, __value)
#define vsetq_lane_f16(__a, __b, __idx) __arm_vsetq_lane_f16(__a, __b, __idx)
#define vsetq_lane_f32(__a, __b, __idx) __arm_vsetq_lane_f32(__a, __b, __idx)
#define vsetq_lane_s16(__a, __b, __idx) __arm_vsetq_lane_s16(__a, __b, __idx)
@@ -147,60 +111,6 @@
__builtin_arm_lane_check (__ARM_NUM_LANES(__vec), \
__ARM_LANEQ(__vec, __idx))
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_s8 (int8_t * __addr, int8x16x4_t __value)
-{
- union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_s16 (int16_t * __addr, int16x8x4_t __value)
-{
- union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_s32 (int32_t * __addr, int32x4x4_t __value)
-{
- union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_u8 (uint8_t * __addr, uint8x16x4_t __value)
-{
- union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_u16 (uint16_t * __addr, uint16x8x4_t __value)
-{
- union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_u32 (uint32_t * __addr, uint32x4x4_t __value)
-{
- union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o);
-}
-
__extension__ extern __inline mve_pred16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vpnot (mve_pred16_t __a)
@@ -208,168 +118,6 @@ __arm_vpnot (mve_pred16_t __a)
return __builtin_mve_vpnotv16bi (__a);
}
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_s8 (int8_t * __addr, int8x16x2_t __value)
-{
- union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst2qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_u8 (uint8_t * __addr, uint8x16x2_t __value)
-{
- union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst2qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline int8x16x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_s8 (int8_t const * __addr)
-{
- union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_mve_vld2qv16qi ((__builtin_neon_qi *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline uint8x16x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_u8 (uint8_t const * __addr)
-{
- union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_mve_vld2qv16qi ((__builtin_neon_qi *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline int8x16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_s8 (int8_t const * __addr)
-{
- union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_mve_vld4qv16qi ((__builtin_neon_qi *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline uint8x16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_u8 (uint8_t const * __addr)
-{
- union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_mve_vld4qv16qi ((__builtin_neon_qi *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_s16 (int16_t * __addr, int16x8x2_t __value)
-{
- union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst2qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_u16 (uint16_t * __addr, uint16x8x2_t __value)
-{
- union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst2qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline int16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_s16 (int16_t const * __addr)
-{
- union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_mve_vld2qv8hi ((__builtin_neon_hi *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline uint16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_u16 (uint16_t const * __addr)
-{
- union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_mve_vld2qv8hi ((__builtin_neon_hi *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline int16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_s16 (int16_t const * __addr)
-{
- union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_mve_vld4qv8hi ((__builtin_neon_hi *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline uint16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_u16 (uint16_t const * __addr)
-{
- union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_mve_vld4qv8hi ((__builtin_neon_hi *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_s32 (int32_t * __addr, int32x4x2_t __value)
-{
- union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst2qv4si ((__builtin_neon_si *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_u32 (uint32_t * __addr, uint32x4x2_t __value)
-{
- union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst2qv4si ((__builtin_neon_si *) __addr, __rv.__o);
-}
-
-__extension__ extern __inline int32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_s32 (int32_t const * __addr)
-{
- union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_mve_vld2qv4si ((__builtin_neon_si *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline uint32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_u32 (uint32_t const * __addr)
-{
- union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_mve_vld2qv4si ((__builtin_neon_si *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline int32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_s32 (int32_t const * __addr)
-{
- union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_mve_vld4qv4si ((__builtin_neon_si *) __addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline uint32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_u32 (uint32_t const * __addr)
-{
- union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_mve_vld4qv4si ((__builtin_neon_si *) __addr);
- return __rv.__i;
-}
-
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __idx)
@@ -620,78 +368,6 @@ __arm_srshr (int32_t value, const int shift)
#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. */
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_f16 (float16_t * __addr, float16x8x4_t __value)
-{
- union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst4qv8hf (__addr, __rv.__o);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q_f32 (float32_t * __addr, float32x4x4_t __value)
-{
- union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst4qv4sf (__addr, __rv.__o);
-}
-
-__extension__ extern __inline float16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_f16 (float16_t const * __addr)
-{
- union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_mve_vld4qv8hf (__addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline float16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_f16 (float16_t const * __addr)
-{
- union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_mve_vld2qv8hf (__addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_f16 (float16_t * __addr, float16x8x2_t __value)
-{
- union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst2qv8hf (__addr, __rv.__o);
-}
-
-__extension__ extern __inline float32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q_f32 (float32_t const * __addr)
-{
- union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
- __rv.__o = __builtin_mve_vld4qv4sf (__addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline float32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q_f32 (float32_t const * __addr)
-{
- union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__o = __builtin_mve_vld2qv4sf (__addr);
- return __rv.__i;
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q_f32 (float32_t * __addr, float32x4x2_t __value)
-{
- union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
- __rv.__i = __value;
- __builtin_mve_vst2qv4sf (__addr, __rv.__o);
-}
-
__extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vsetq_lane_f16 (float16_t __a, float16x8_t __b, const int __idx)
@@ -728,173 +404,6 @@ __arm_vgetq_lane_f32 (float32x4_t __a, const int __idx)
#endif
#ifdef __cplusplus
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (int8_t * __addr, int8x16x4_t __value)
-{
- __arm_vst4q_s8 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (int16_t * __addr, int16x8x4_t __value)
-{
- __arm_vst4q_s16 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (int32_t * __addr, int32x4x4_t __value)
-{
- __arm_vst4q_s32 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (uint8_t * __addr, uint8x16x4_t __value)
-{
- __arm_vst4q_u8 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (uint16_t * __addr, uint16x8x4_t __value)
-{
- __arm_vst4q_u16 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (uint32_t * __addr, uint32x4x4_t __value)
-{
- __arm_vst4q_u32 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (int8_t * __addr, int8x16x2_t __value)
-{
- __arm_vst2q_s8 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (uint8_t * __addr, uint8x16x2_t __value)
-{
- __arm_vst2q_u8 (__addr, __value);
-}
-
-__extension__ extern __inline int8x16x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (int8_t const * __addr)
-{
- return __arm_vld2q_s8 (__addr);
-}
-
-__extension__ extern __inline uint8x16x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (uint8_t const * __addr)
-{
- return __arm_vld2q_u8 (__addr);
-}
-
-__extension__ extern __inline int8x16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (int8_t const * __addr)
-{
- return __arm_vld4q_s8 (__addr);
-}
-
-__extension__ extern __inline uint8x16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (uint8_t const * __addr)
-{
- return __arm_vld4q_u8 (__addr);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (int16_t * __addr, int16x8x2_t __value)
-{
- __arm_vst2q_s16 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (uint16_t * __addr, uint16x8x2_t __value)
-{
- __arm_vst2q_u16 (__addr, __value);
-}
-
-__extension__ extern __inline int16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (int16_t const * __addr)
-{
- return __arm_vld2q_s16 (__addr);
-}
-
-__extension__ extern __inline uint16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (uint16_t const * __addr)
-{
- return __arm_vld2q_u16 (__addr);
-}
-
-__extension__ extern __inline int16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (int16_t const * __addr)
-{
- return __arm_vld4q_s16 (__addr);
-}
-
-__extension__ extern __inline uint16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (uint16_t const * __addr)
-{
- return __arm_vld4q_u16 (__addr);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (int32_t * __addr, int32x4x2_t __value)
-{
- __arm_vst2q_s32 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (uint32_t * __addr, uint32x4x2_t __value)
-{
- __arm_vst2q_u32 (__addr, __value);
-}
-
-__extension__ extern __inline int32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (int32_t const * __addr)
-{
- return __arm_vld2q_s32 (__addr);
-}
-
-__extension__ extern __inline uint32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (uint32_t const * __addr)
-{
- return __arm_vld2q_u32 (__addr);
-}
-
-__extension__ extern __inline int32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (int32_t const * __addr)
-{
- return __arm_vld4q_s32 (__addr);
-}
-
-__extension__ extern __inline uint32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (uint32_t const * __addr)
-{
- return __arm_vld4q_u32 (__addr);
-}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -1010,62 +519,6 @@ __arm_vgetq_lane (uint64x2_t __a, const int __idx)
#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. */
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (float16_t * __addr, float16x8x4_t __value)
-{
- __arm_vst4q_f16 (__addr, __value);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst4q (float32_t * __addr, float32x4x4_t __value)
-{
- __arm_vst4q_f32 (__addr, __value);
-}
-
-__extension__ extern __inline float16x8x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (float16_t const * __addr)
-{
- return __arm_vld4q_f16 (__addr);
-}
-
-__extension__ extern __inline float16x8x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (float16_t const * __addr)
-{
- return __arm_vld2q_f16 (__addr);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (float16_t * __addr, float16x8x2_t __value)
-{
- __arm_vst2q_f16 (__addr, __value);
-}
-
-__extension__ extern __inline float32x4x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld4q (float32_t const * __addr)
-{
- return __arm_vld4q_f32 (__addr);
-}
-
-__extension__ extern __inline float32x4x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vld2q (float32_t const * __addr)
-{
- return __arm_vld2q_f32 (__addr);
-}
-
-__extension__ extern __inline void
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vst2q (float32_t * __addr, float32x4x2_t __value)
-{
- __arm_vst2q_f32 (__addr, __value);
-}
-
__extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vsetq_lane (float16_t __a, float16x8_t __b, const int __idx)
@@ -1405,51 +858,6 @@ extern void *__ARM_undef;
#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. */
-#define __arm_vst4q(p0,p1) ({ __typeof(p0) __p0 = (p0); \
- __typeof(p1) __p1 = (p1); \
- _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
- int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce_s8_ptr(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \
- int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce_s16_ptr(__p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \
- int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce_s32_ptr(__p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \
- int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce_u8_ptr(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \
- int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce_u16_ptr(__p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \
- int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)), \
- int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x4_t]: __arm_vst4q_f16 (__ARM_mve_coerce_f16_ptr(__p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x4_t)), \
- int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x4_t]: __arm_vst4q_f32 (__ARM_mve_coerce_f32_ptr(__p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x4_t)));})
-
-#define __arm_vld2q(p0) ( \
- _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
- int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
- int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
- int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
- int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
- int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
- int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *)), \
- int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld2q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *)), \
- int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld2q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *))))
-
-#define __arm_vld4q(p0) ( \
- _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
- int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
- int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
- int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
- int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
- int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
- int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *)), \
- int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld4q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *)), \
- int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld4q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *))))
-
-#define __arm_vst2q(p0,p1) ({ __typeof(p1) __p1 = (p1); \
- _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
- int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x2_t]: __arm_vst2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x2_t)), \
- int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x2_t]: __arm_vst2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x2_t)), \
- int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x2_t]: __arm_vst2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x2_t)), \
- int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x2_t]: __arm_vst2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x2_t)), \
- int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x2_t]: __arm_vst2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x2_t)), \
- int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x2_t]: __arm_vst2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x2_t)), \
- int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x2_t]: __arm_vst2q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x2_t)), \
- int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x2_t]: __arm_vst2q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x2_t)));})
-
#define __arm_vuninitializedq(p0) ({ __typeof(p0) __p0 = (p0); \
_Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
int (*)[__ARM_mve_type_int8x16_t]: __arm_vuninitializedq_s8 (), \
@@ -1492,25 +900,6 @@ extern void *__ARM_undef;
#else /* MVE Integer. */
-#define __arm_vst4q(p0,p1) ({ __typeof(p1) __p1 = (p1); \
- _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
- int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \
- int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \
- int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \
- int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \
- int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \
- int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)));})
-
-#define __arm_vst2q(p0,p1) ({ __typeof(p1) __p1 = (p1); \
- _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \
- int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x2_t]: __arm_vst2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x2_t)), \
- int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x2_t]: __arm_vst2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x2_t)), \
- int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x2_t]: __arm_vst2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x2_t)), \
- int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x2_t]: __arm_vst2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x2_t)), \
- int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x2_t]: __arm_vst2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x2_t)), \
- int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x2_t]: __arm_vst2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x2_t)));})
-
-
#define __arm_vuninitializedq(p0) ({ __typeof(p0) __p0 = (p0); \
_Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
int (*)[__ARM_mve_type_int8x16_t]: __arm_vuninitializedq_s8 (), \
@@ -1522,23 +911,6 @@ extern void *__ARM_undef;
int (*)[__ARM_mve_type_uint32x4_t]: __arm_vuninitializedq_u32 (), \
int (*)[__ARM_mve_type_uint64x2_t]: __arm_vuninitializedq_u64 ());})
-#define __arm_vld2q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
- int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
- int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
- int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld2q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
- int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld2q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
- int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
- int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *))))
-
-
-#define __arm_vld4q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \
- int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld4q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \
- int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld4q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \
- int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld4q_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *)), \
- int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld4q_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *)), \
- int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *)), \
- int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *))))
-
#define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \
_Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
int (*)[__ARM_mve_type_int8x16_t]: __arm_vgetq_lane_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1), \
@@ -18,7 +18,6 @@
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
-VAR5 (STORE1, vst4q, v16qi, v8hi, v4si, v8hf, v4sf)
VAR2 (UNOP_NONE_NONE, vrndxq_f, v8hf, v4sf)
VAR2 (UNOP_NONE_NONE, vrndq_f, v8hf, v4sf)
VAR2 (UNOP_NONE_NONE, vrndpq_f, v8hf, v4sf)
@@ -679,9 +678,6 @@ VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbciq_m_s, v4si)
VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbciq_m_u, v4si)
VAR1 (QUADOP_NONE_NONE_NONE_NONE_PRED, vsbcq_m_s, v4si)
VAR1 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vsbcq_m_u, v4si)
-VAR5 (STORE1, vst2q, v16qi, v8hi, v4si, v8hf, v4sf)
-VAR5 (LOAD1, vld4q, v16qi, v8hi, v4si, v8hf, v4sf)
-VAR5 (LOAD1, vld2q, v16qi, v8hi, v4si, v8hf, v4sf)
VAR1 (ASRL, sqrshr_,si)
VAR1 (ASRL, sqrshrl_sat64_,di)
VAR1 (ASRL, sqrshrl_sat48_,di)
@@ -139,7 +139,18 @@ (define_mode_iterator VQXMOV [V16QI V8HI V8HF V8BF V4SI V4SF V2DI TI])
;; Opaque structure types wider than TImode.
(define_mode_iterator VSTRUCT [(EI "!TARGET_HAVE_MVE") OI
- (CI "!TARGET_HAVE_MVE") XI])
+ (CI "!TARGET_HAVE_MVE") XI
+ (V2x16QI "TARGET_HAVE_MVE")
+ (V2x8HI "TARGET_HAVE_MVE")
+ (V2x4SI "TARGET_HAVE_MVE")
+ (V2x8HF "TARGET_HAVE_MVE_FLOAT")
+ (V2x4SF "TARGET_HAVE_MVE_FLOAT")
+ (V4x16QI "TARGET_HAVE_MVE")
+ (V4x8HI "TARGET_HAVE_MVE")
+ (V4x4SI "TARGET_HAVE_MVE")
+ (V4x8HF "TARGET_HAVE_MVE_FLOAT")
+ (V4x4SF "TARGET_HAVE_MVE_FLOAT")
+ ])
;; Opaque structure types used in table lookups (except vtbl1/vtbx1).
(define_mode_iterator VTAB [TI EI OI])
@@ -286,6 +297,29 @@ (define_mode_iterator MVE_7_HI [HI V16BI V8BI V4BI V2QI])
(define_mode_iterator MVE_V8HF [V8HF])
(define_mode_iterator MVE_V16QI [V16QI])
+(define_mode_attr MVE_VLD2_VST2 [(V16QI "V2x16QI")
+ (V8HI "V2x8HI")
+ (V4SI "V2x4SI")
+ (V8HF "V2x8HF")
+ (V4SF "V2x4SF")])
+(define_mode_attr MVE_vld2_vst2 [(V16QI "v2x16qi")
+ (V8HI "v2x8hi")
+ (V4SI "v2x4si")
+ (V8HF "v2x8hf")
+ (V4SF "v2x4sf")])
+
+(define_mode_attr MVE_VLD4_VST4 [(V16QI "V4x16QI")
+ (V8HI "V4x8HI")
+ (V4SI "V4x4SI")
+ (V8HF "V4x8HF")
+ (V4SF "V4x4SF")])
+
+(define_mode_attr MVE_vld4_vst4 [(V16QI "v4x16qi")
+ (V8HI "v4x8hi")
+ (V4SI "v4x4si")
+ (V8HF "v4x8hf")
+ (V4SF "v4x4sf")])
+
;; Types for MVE truncating stores and widening loads
(define_mode_iterator MVE_w_narrow_TYPE [V8QI V4QI V4HI])
(define_mode_attr MVE_w_narrow_type [(V8QI "v8qi") (V4QI "v4qi") (V4HI "v4hi")])
@@ -110,13 +110,14 @@ (define_insn "@mve_vdupq_n<mode>"
;;
;; [vst4q])
;;
-(define_insn "mve_vst4q<mode>"
- [(set (match_operand:XI 0 "mve_struct_operand" "=Ug")
- (unspec:XI [(match_operand:XI 1 "s_register_operand" "w")
- (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+(define_insn "@mve_vst4q<mode>"
+ [(set (match_operand:<MVE_VLD4_VST4> 0 "mve_struct_operand" "=Ug")
+ (unspec:<MVE_VLD4_VST4>
+ [(match_operand:<MVE_VLD4_VST4> 1 "s_register_operand" "w")
+ (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
VST4Q))
]
- "TARGET_HAVE_MVE"
+ "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))"
{
rtx ops[6];
int regno = REGNO (operands[1]);
@@ -4061,14 +4062,14 @@ (define_insn "@mve_<mve_insn>q_m_<supf>v4si"
;;
;; [vst2q])
;;
-(define_insn "mve_vst2q<mode>"
- [(set (match_operand:OI 0 "mve_struct_operand" "=Ug")
- (unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
- (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+(define_insn "@mve_vst2q<mode>"
+ [(set (match_operand:<MVE_VLD2_VST2> 0 "mve_struct_operand" "=Ug")
+ (unspec:<MVE_VLD2_VST2>
+ [(match_operand:<MVE_VLD2_VST2> 1 "s_register_operand" "w")
+ (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
VST2Q))
]
- "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
- || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))"
+ "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))"
{
rtx ops[4];
int regno = REGNO (operands[1]);
@@ -4089,14 +4090,14 @@ (define_insn "mve_vst2q<mode>"
;;
;; [vld2q])
;;
-(define_insn "mve_vld2q<mode>"
- [(set (match_operand:OI 0 "s_register_operand" "=w")
- (unspec:OI [(match_operand:OI 1 "mve_struct_operand" "Ug")
- (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+(define_insn "@mve_vld2q<mode>"
+ [(set (match_operand:<MVE_VLD2_VST2> 0 "s_register_operand" "=w")
+ (unspec:<MVE_VLD2_VST2>
+ [(match_operand:<MVE_VLD2_VST2> 1 "mve_struct_operand" "Ug")
+ (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
VLD2Q))
]
- "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
- || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))"
+ "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))"
{
rtx ops[4];
int regno = REGNO (operands[0]);
@@ -4117,14 +4118,14 @@ (define_insn "mve_vld2q<mode>"
;;
;; [vld4q])
;;
-(define_insn "mve_vld4q<mode>"
- [(set (match_operand:XI 0 "s_register_operand" "=w")
- (unspec:XI [(match_operand:XI 1 "mve_struct_operand" "Ug")
- (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+(define_insn "@mve_vld4q<mode>"
+ [(set (match_operand:<MVE_VLD4_VST4> 0 "s_register_operand" "=w")
+ (unspec:<MVE_VLD4_VST4>
+ [(match_operand:<MVE_VLD4_VST4> 1 "mve_struct_operand" "Ug")
+ (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
VLD4Q))
]
- "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
- || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))"
+ "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))"
{
rtx ops[6];
int regno = REGNO (operands[0]);
@@ -492,12 +492,21 @@ (define_expand "vec_load_lanesoi<mode>"
(unspec:OI [(match_operand:OI 1 "neon_struct_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2))]
- "TARGET_NEON || TARGET_HAVE_MVE"
+ "TARGET_NEON"
{
- if (TARGET_NEON)
- emit_insn (gen_neon_vld2<mode> (operands[0], operands[1]));
- else
- emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1]));
+ emit_insn (gen_neon_vld2<mode> (operands[0], operands[1]));
+ DONE;
+})
+
+;;; On MVE, use the V2x<mode> tuple modes instead of OI for vld2.
+(define_expand "vec_load_lanes<MVE_vld2_vst2><mode>"
+ [(set (match_operand:<MVE_VLD2_VST2> 0 "s_register_operand")
+ (unspec:<MVE_VLD2_VST2> [(match_operand:<MVE_VLD2_VST2> 1 "neon_struct_operand")
+ (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD2))]
+ "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))"
+{
+ emit_insn (gen_mve_vld2q<mode> (operands[0], operands[1]));
DONE;
})
@@ -506,12 +515,21 @@ (define_expand "vec_store_lanesoi<mode>"
(unspec:OI [(match_operand:OI 1 "s_register_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VST2))]
- "TARGET_NEON || TARGET_HAVE_MVE"
+ "TARGET_NEON"
{
- if (TARGET_NEON)
- emit_insn (gen_neon_vst2<mode> (operands[0], operands[1]));
- else
- emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
+ emit_insn (gen_neon_vst2<mode> (operands[0], operands[1]));
+ DONE;
+})
+
+;;; On MVE, use the V2x<mode> tuple modes instead of OI for vst2.
+(define_expand "vec_store_lanes<MVE_vld2_vst2><mode>"
+ [(set (match_operand:<MVE_VLD2_VST2> 0 "neon_struct_operand")
+ (unspec:<MVE_VLD2_VST2> [(match_operand:<MVE_VLD2_VST2> 1 "s_register_operand")
+ (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST2))]
+ "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD2_VST2>mode))"
+{
+ emit_insn (gen_mve_vst2q<mode> (operands[0], operands[1]));
DONE;
})
@@ -519,12 +537,21 @@ (define_expand "vec_load_lanesxi<mode>"
[(match_operand:XI 0 "s_register_operand")
(match_operand:XI 1 "neon_struct_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
- "TARGET_NEON || TARGET_HAVE_MVE"
+ "TARGET_NEON"
{
- if (TARGET_NEON)
- emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
- else
- emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
+ emit_insn (gen_neon_vld4<mode> (operands[0], operands[1]));
+ DONE;
+})
+
+;;; On MVE, use the V4x<mode> tuple modes instead of XI for vld4.
+(define_expand "vec_load_lanes<MVE_vld4_vst4><mode>"
+ [(set (match_operand:<MVE_VLD4_VST4> 0 "s_register_operand")
+ (unspec:<MVE_VLD4_VST4> [(match_operand:<MVE_VLD4_VST4> 1 "neon_struct_operand")
+ (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD4))]
+ "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))"
+{
+ emit_insn (gen_mve_vld4q<mode> (operands[0], operands[1]));
DONE;
})
@@ -532,12 +559,21 @@ (define_expand "vec_store_lanesxi<mode>"
[(match_operand:XI 0 "neon_struct_operand")
(match_operand:XI 1 "s_register_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
- "TARGET_NEON || TARGET_HAVE_MVE"
+ "TARGET_NEON"
{
- if (TARGET_NEON)
- emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
- else
- emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
+ emit_insn (gen_neon_vst4<mode> (operands[0], operands[1]));
+ DONE;
+})
+
+;;; On MVE, use the V4x<mode> tuple modes instead of XI for vst4.
+(define_expand "vec_store_lanes<MVE_vld4_vst4><mode>"
+ [(set (match_operand:<MVE_VLD4_VST4> 0 "neon_struct_operand")
+ (unspec:<MVE_VLD4_VST4> [(match_operand:<MVE_VLD4_VST4> 1 "s_register_operand")
+ (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST4))]
+ "(TARGET_HAVE_MVE && VALID_MVE_STRUCT_MODE (<MVE_VLD4_VST4>mode))"
+{
+ emit_insn (gen_mve_vst4q<mode> (operands[0], operands[1]));
DONE;
})