Message ID | 20240904132650.2720446-35-christophe.lyon@linaro.org
---|---
State | New
Series | arm: [MVE intrinsics] Re-implement more intrinsics
On 04/09/2024 14:26, Christophe Lyon wrote: > Implement vadcq using the new MVE builtins framework. > > We re-use most of the code introduced by the previous patch to support > vadciq: we just need to initialize carry from the input parameter. > > 2024-08-28 Christophe Lyon <christophe.lyon@linaro.org> > > gcc/ > > * config/arm/arm-mve-builtins-base.cc (vadcq_vsbc): Add support > for vadcq. > * config/arm/arm-mve-builtins-base.def (vadcq): New. > * config/arm/arm-mve-builtins-base.h (vadcq): New. > * config/arm/arm_mve.h (vadcq): Delete. > (vadcq_m): Delete. > (vadcq_s32): Delete. > (vadcq_u32): Delete. > (vadcq_m_s32): Delete. > (vadcq_m_u32): Delete. > (__arm_vadcq_s32): Delete. > (__arm_vadcq_u32): Delete. > (__arm_vadcq_m_s32): Delete. > (__arm_vadcq_m_u32): Delete. > (__arm_vadcq): Delete. > (__arm_vadcq_m): Delete. > + if (!m_init_carry) > + { > + /* Prepare carry in: > + set_fpscr ( (fpscr & ~0x20000000u) > + | ((*carry & 1u) << 29) ) */ > + rtx carry_in = gen_reg_rtx (SImode); > + rtx fpscr = gen_reg_rtx (SImode); > + emit_insn (gen_get_fpscr_nzcvqc (fpscr)); > + emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr))); > + > + emit_insn (gen_rtx_SET (carry_in, > + gen_rtx_ASHIFT (SImode, > + carry_in, > + GEN_INT (29)))); > + emit_insn (gen_rtx_SET (carry_in, > + gen_rtx_AND (SImode, > + carry_in, > + GEN_INT (0x20000000)))); > + emit_insn (gen_rtx_SET (fpscr, > + gen_rtx_AND (SImode, > + fpscr, > + GEN_INT (~0x20000000)))); > + emit_insn (gen_rtx_SET (carry_in, > + gen_rtx_IOR (SImode, > + carry_in, > + fpscr))); > + emit_insn (gen_set_fpscr_nzcvqc (carry_in)); > + } What's the logic here? Are we just trying to set the C flag to *carry != 0 (is carry a bool?)? Do we really need to preserve all the other bits in NZCV? I wouldn't have thought so, suggesting that: CMP *carry, #1 // Set C if *carry != 0 ought to be enough, without having to generate a read-modify-write sequence. R. > --- > gcc/config/arm/arm-mve-builtins-base.cc | 61 +++++++++++++++-- > gcc/config/arm/arm-mve-builtins-base.def | 1 + > gcc/config/arm/arm-mve-builtins-base.h | 1 + > gcc/config/arm/arm_mve.h | 87 ------------------------ > 4 files changed, 56 insertions(+), 94 deletions(-) > > diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc > index 6f3b18c2915..9c2e11356ef 100644 > --- a/gcc/config/arm/arm-mve-builtins-base.cc > +++ b/gcc/config/arm/arm-mve-builtins-base.cc > @@ -559,10 +559,19 @@ public: > class vadc_vsbc_impl : public function_base > { > public: > + CONSTEXPR vadc_vsbc_impl (bool init_carry) > + : m_init_carry (init_carry) > + {} > + > + /* Initialize carry with 0 (vadci). 
*/ > + bool m_init_carry; > + > unsigned int > call_properties (const function_instance &) const override > { > unsigned int flags = CP_WRITE_MEMORY | CP_READ_FPCR; > + if (!m_init_carry) > + flags |= CP_READ_MEMORY; > return flags; > } > > @@ -605,22 +614,59 @@ public: > carry_ptr = e.args[carry_out_arg_no]; > e.args.ordered_remove (carry_out_arg_no); > > + if (!m_init_carry) > + { > + /* Prepare carry in: > + set_fpscr ( (fpscr & ~0x20000000u) > + | ((*carry & 1u) << 29) ) */ > + rtx carry_in = gen_reg_rtx (SImode); > + rtx fpscr = gen_reg_rtx (SImode); > + emit_insn (gen_get_fpscr_nzcvqc (fpscr)); > + emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr))); > + > + emit_insn (gen_rtx_SET (carry_in, > + gen_rtx_ASHIFT (SImode, > + carry_in, > + GEN_INT (29)))); > + emit_insn (gen_rtx_SET (carry_in, > + gen_rtx_AND (SImode, > + carry_in, > + GEN_INT (0x20000000)))); > + emit_insn (gen_rtx_SET (fpscr, > + gen_rtx_AND (SImode, > + fpscr, > + GEN_INT (~0x20000000)))); > + emit_insn (gen_rtx_SET (carry_in, > + gen_rtx_IOR (SImode, > + carry_in, > + fpscr))); > + emit_insn (gen_set_fpscr_nzcvqc (carry_in)); > + } > + > switch (e.pred) > { > case PRED_none: > /* No predicate. */ > - unspec = e.type_suffix (0).unsigned_p > - ? VADCIQ_U > - : VADCIQ_S; > + unspec = m_init_carry > + ? (e.type_suffix (0).unsigned_p > + ? VADCIQ_U > + : VADCIQ_S) > + : (e.type_suffix (0).unsigned_p > + ? VADCQ_U > + : VADCQ_S); > code = code_for_mve_q_v4si (unspec, unspec); > insns = e.use_exact_insn (code); > break; > > case PRED_m: > /* "m" predicate. */ > - unspec = e.type_suffix (0).unsigned_p > - ? VADCIQ_M_U > - : VADCIQ_M_S; > + unspec = m_init_carry > + ? (e.type_suffix (0).unsigned_p > + ? VADCIQ_M_U > + : VADCIQ_M_S) > + : (e.type_suffix (0).unsigned_p > + ? 
VADCQ_M_U > + : VADCQ_M_S); > code = code_for_mve_q_m_v4si (unspec, unspec); > insns = e.use_cond_insn (code, 0); > break; > @@ -816,7 +862,8 @@ namespace arm_mve { > FUNCTION_PRED_P_S_U (vabavq, VABAVQ) > FUNCTION_WITHOUT_N (vabdq, VABDQ) > FUNCTION (vabsq, unspec_based_mve_function_exact_insn, (ABS, ABS, ABS, -1, -1, -1, VABSQ_M_S, -1, VABSQ_M_F, -1, -1, -1)) > -FUNCTION (vadciq, vadc_vsbc_impl,) > +FUNCTION (vadciq, vadc_vsbc_impl, (true)) > +FUNCTION (vadcq, vadc_vsbc_impl, (false)) > FUNCTION_WITH_RTX_M_N (vaddq, PLUS, VADDQ) > FUNCTION_PRED_P_S_U (vaddlvaq, VADDLVAQ) > FUNCTION_PRED_P_S_U (vaddlvq, VADDLVQ) > diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def > index 72d6461c4e4..37efa6bf13e 100644 > --- a/gcc/config/arm/arm-mve-builtins-base.def > +++ b/gcc/config/arm/arm-mve-builtins-base.def > @@ -22,6 +22,7 @@ DEF_MVE_FUNCTION (vabavq, binary_acca_int32, all_integer, p_or_none) > DEF_MVE_FUNCTION (vabdq, binary, all_integer, mx_or_none) > DEF_MVE_FUNCTION (vabsq, unary, all_signed, mx_or_none) > DEF_MVE_FUNCTION (vadciq, vadc_vsbc, integer_32, m_or_none) > +DEF_MVE_FUNCTION (vadcq, vadc_vsbc, integer_32, m_or_none) > DEF_MVE_FUNCTION (vaddlvaq, unary_widen_acc, integer_32, p_or_none) > DEF_MVE_FUNCTION (vaddlvq, unary_acc, integer_32, p_or_none) > DEF_MVE_FUNCTION (vaddq, binary_opt_n, all_integer, mx_or_none) > diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h > index 2dfc2e18062..eb8423c3fe2 100644 > --- a/gcc/config/arm/arm-mve-builtins-base.h > +++ b/gcc/config/arm/arm-mve-builtins-base.h > @@ -27,6 +27,7 @@ extern const function_base *const vabavq; > extern const function_base *const vabdq; > extern const function_base *const vabsq; > extern const function_base *const vadciq; > +extern const function_base *const vadcq; > extern const function_base *const vaddlvaq; > extern const function_base *const vaddlvq; > extern const function_base *const vaddq; > diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h > index 3a0b3041c42..dd7b6f5cdab 100644 > --- a/gcc/config/arm/arm_mve.h > +++ b/gcc/config/arm/arm_mve.h > @@ -85,8 +85,6 @@ > #define vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) > #define vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) > #define vstrwq_scatter_base_wb(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb(__addr, __offset, __value) > -#define vadcq(__a, __b, __carry) __arm_vadcq(__a, __b, __carry) > -#define vadcq_m(__inactive, __a, __b, __carry, __p) __arm_vadcq_m(__inactive, __a, __b, __carry, __p) > #define vsbciq(__a, __b, __carry_out) __arm_vsbciq(__a, __b, __carry_out) > #define vsbciq_m(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m(__inactive, __a, __b, __carry_out, __p) > #define vsbcq(__a, __b, __carry) __arm_vsbcq(__a, __b, __carry) > @@ -319,10 +317,6 @@ > #define vstrwq_scatter_base_wb_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_s32(__addr, __offset, __value) > #define vstrwq_scatter_base_wb_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_u32(__addr, __offset, __value) > #define vstrwq_scatter_base_wb_f32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_f32(__addr, __offset, __value) > -#define vadcq_s32(__a, __b, __carry) __arm_vadcq_s32(__a, __b, __carry) > -#define vadcq_u32(__a, __b, __carry) __arm_vadcq_u32(__a, __b, __carry) > -#define 
vadcq_m_s32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_s32(__inactive, __a, __b, __carry, __p) > -#define vadcq_m_u32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_u32(__inactive, __a, __b, __carry, __p) > #define vsbciq_s32(__a, __b, __carry_out) __arm_vsbciq_s32(__a, __b, __carry_out) > #define vsbciq_u32(__a, __b, __carry_out) __arm_vsbciq_u32(__a, __b, __carry_out) > #define vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) > @@ -1684,46 +1678,6 @@ __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint3 > *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value); > } > > -__extension__ extern __inline int32x4_t > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > -__arm_vadcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry) > -{ > - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); > - int32x4_t __res = __builtin_mve_vadcq_sv4si (__a, __b); > - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; > - return __res; > -} > - > -__extension__ extern __inline uint32x4_t > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > -__arm_vadcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) > -{ > - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); > - uint32x4_t __res = __builtin_mve_vadcq_uv4si (__a, __b); > - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; > - return __res; > -} > - > -__extension__ extern __inline int32x4_t > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > -__arm_vadcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) > -{ > - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); > - int32x4_t __res = __builtin_mve_vadcq_m_sv4si (__inactive, __a, __b, __p); > - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; > - return __res; > -} > - > -__extension__ extern __inline uint32x4_t > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > -__arm_vadcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) > -{ > - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); > - uint32x4_t __res = __builtin_mve_vadcq_m_uv4si (__inactive, __a, __b, __p); > - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; > - return __res; > -} > - > __extension__ extern __inline int32x4_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > __arm_vsbciq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) > @@ -3600,34 +3554,6 @@ __arm_vstrwq_scatter_base_wb (uint32x4_t * __addr, const int __offset, uint32x4_ > __arm_vstrwq_scatter_base_wb_u32 (__addr, __offset, __value); > } > > -__extension__ extern __inline int32x4_t > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > -__arm_vadcq (int32x4_t __a, int32x4_t __b, unsigned * __carry) > -{ > - return __arm_vadcq_s32 (__a, __b, __carry); > -} > - > -__extension__ extern __inline uint32x4_t > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > -__arm_vadcq (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) > -{ > - return __arm_vadcq_u32 (__a, __b, __carry); > -} > - > -__extension__ extern __inline int32x4_t > 
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > -__arm_vadcq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) > -{ > - return __arm_vadcq_m_s32 (__inactive, __a, __b, __carry, __p); > -} > - > -__extension__ extern __inline uint32x4_t > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > -__arm_vadcq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) > -{ > - return __arm_vadcq_m_u32 (__inactive, __a, __b, __carry, __p); > -} > - > __extension__ extern __inline int32x4_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > __arm_vsbciq (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) > @@ -5245,19 +5171,6 @@ extern void *__ARM_undef; > int (*)[__ARM_mve_type_int64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_s64 (__ARM_mve_coerce_s64_ptr(p0, int64_t *), p1, p2), \ > int (*)[__ARM_mve_type_uint64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_u64 (__ARM_mve_coerce_u64_ptr(p0, uint64_t *), p1, p2))) > > -#define __arm_vadcq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ > - __typeof(p1) __p1 = (p1); \ > - __typeof(p2) __p2 = (p2); \ > - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ > - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3, p4), \ > - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3, p4));}) > - > -#define __arm_vadcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ > - __typeof(p1) __p1 = (p1); \ > - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ > - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ > - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) > - > #define __arm_vsbciq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ > __typeof(p1) __p1 = (p1); \ > __typeof(p2) __p2 = (p2); \
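
For readers following the review question above: the RTL sequence under discussion implements the same carry-in setup as the (now deleted) arm_mve.h inline functions quoted later in the thread. A minimal C sketch of that computation, with a hypothetical helper name, assuming the FPSCR NZCVQC builtins used by the old header:

  /* Illustrative only: mirrors the carry-in preparation discussed above.
     Bit 29 of FPSCR is the C (carry) flag consumed by VADC.  */
  static inline void
  prepare_carry_in (const unsigned *carry)
  {
    unsigned fpscr = __builtin_arm_get_fpscr_nzcvqc ();
    /* Clear the carry bit, then insert the low bit of *carry at bit 29.  */
    fpscr = (fpscr & ~0x20000000u) | ((*carry & 0x1u) << 29);
    __builtin_arm_set_fpscr_nzcvqc (fpscr);
  }
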
On 14/10/2024 19:18, Richard Earnshaw (lists) wrote: > On 04/09/2024 14:26, Christophe Lyon wrote: >> Implement vadcq using the new MVE builtins framework. >> >> We re-use most of the code introduced by the previous patch to support >> vadciq: we just need to initialize carry from the input parameter. >> >> 2024-08-28 Christophe Lyon <christophe.lyon@linaro.org> >> >> gcc/ >> >> * config/arm/arm-mve-builtins-base.cc (vadcq_vsbc): Add support >> for vadcq. >> * config/arm/arm-mve-builtins-base.def (vadcq): New. >> * config/arm/arm-mve-builtins-base.h (vadcq): New. >> * config/arm/arm_mve.h (vadcq): Delete. >> (vadcq_m): Delete. >> (vadcq_s32): Delete. >> (vadcq_u32): Delete. >> (vadcq_m_s32): Delete. >> (vadcq_m_u32): Delete. >> (__arm_vadcq_s32): Delete. >> (__arm_vadcq_u32): Delete. >> (__arm_vadcq_m_s32): Delete. >> (__arm_vadcq_m_u32): Delete. >> (__arm_vadcq): Delete. >> (__arm_vadcq_m): Delete. > >> + if (!m_init_carry) >> + { >> + /* Prepare carry in: >> + set_fpscr ( (fpscr & ~0x20000000u) >> + | ((*carry & 1u) << 29) ) */ >> + rtx carry_in = gen_reg_rtx (SImode); >> + rtx fpscr = gen_reg_rtx (SImode); >> + emit_insn (gen_get_fpscr_nzcvqc (fpscr)); >> + emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr))); >> + >> + emit_insn (gen_rtx_SET (carry_in, >> + gen_rtx_ASHIFT (SImode, >> + carry_in, >> + GEN_INT (29)))); >> + emit_insn (gen_rtx_SET (carry_in, >> + gen_rtx_AND (SImode, >> + carry_in, >> + GEN_INT (0x20000000)))); >> + emit_insn (gen_rtx_SET (fpscr, >> + gen_rtx_AND (SImode, >> + fpscr, >> + GEN_INT (~0x20000000)))); >> + emit_insn (gen_rtx_SET (carry_in, >> + gen_rtx_IOR (SImode, >> + carry_in, >> + fpscr))); >> + emit_insn (gen_set_fpscr_nzcvqc (carry_in)); >> + } > > What's the logic here? Are we just trying to set the C flag to *carry != 0 (is carry a bool?)? Do we really need to preserve all the other bits in NZCV? I wouldn't have thought so, suggesting that: > > CMP *carry, #1 // Set C if *carry != 0 > > ought to be enough, without having to generate a read-modify-write sequence. I realised last night that this is setting up the fpsr not the cpsr, so my suggestion won't work. I am concerned that expanding this too early will leave something that we can't optimize away if we have back-to-back vadcq intrinsics that chain the carry, but I guess this is no different from what we have already. On that basis, this patch is also OK. We may need to revisit this sequence later to check that we are removing redundant reads + sets. R. > > R. > >> --- >> gcc/config/arm/arm-mve-builtins-base.cc | 61 +++++++++++++++-- >> gcc/config/arm/arm-mve-builtins-base.def | 1 + >> gcc/config/arm/arm-mve-builtins-base.h | 1 + >> gcc/config/arm/arm_mve.h | 87 ------------------------ >> 4 files changed, 56 insertions(+), 94 deletions(-) >> >> diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc >> index 6f3b18c2915..9c2e11356ef 100644 >> --- a/gcc/config/arm/arm-mve-builtins-base.cc >> +++ b/gcc/config/arm/arm-mve-builtins-base.cc >> @@ -559,10 +559,19 @@ public: >> class vadc_vsbc_impl : public function_base >> { >> public: >> + CONSTEXPR vadc_vsbc_impl (bool init_carry) >> + : m_init_carry (init_carry) >> + {} >> + >> + /* Initialize carry with 0 (vadci). 
*/ >> + bool m_init_carry; >> + >> unsigned int >> call_properties (const function_instance &) const override >> { >> unsigned int flags = CP_WRITE_MEMORY | CP_READ_FPCR; >> + if (!m_init_carry) >> + flags |= CP_READ_MEMORY; >> return flags; >> } >> >> @@ -605,22 +614,59 @@ public: >> carry_ptr = e.args[carry_out_arg_no]; >> e.args.ordered_remove (carry_out_arg_no); >> >> + if (!m_init_carry) >> + { >> + /* Prepare carry in: >> + set_fpscr ( (fpscr & ~0x20000000u) >> + | ((*carry & 1u) << 29) ) */ >> + rtx carry_in = gen_reg_rtx (SImode); >> + rtx fpscr = gen_reg_rtx (SImode); >> + emit_insn (gen_get_fpscr_nzcvqc (fpscr)); >> + emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr))); >> + >> + emit_insn (gen_rtx_SET (carry_in, >> + gen_rtx_ASHIFT (SImode, >> + carry_in, >> + GEN_INT (29)))); >> + emit_insn (gen_rtx_SET (carry_in, >> + gen_rtx_AND (SImode, >> + carry_in, >> + GEN_INT (0x20000000)))); >> + emit_insn (gen_rtx_SET (fpscr, >> + gen_rtx_AND (SImode, >> + fpscr, >> + GEN_INT (~0x20000000)))); >> + emit_insn (gen_rtx_SET (carry_in, >> + gen_rtx_IOR (SImode, >> + carry_in, >> + fpscr))); >> + emit_insn (gen_set_fpscr_nzcvqc (carry_in)); >> + } >> + >> switch (e.pred) >> { >> case PRED_none: >> /* No predicate. */ >> - unspec = e.type_suffix (0).unsigned_p >> - ? VADCIQ_U >> - : VADCIQ_S; >> + unspec = m_init_carry >> + ? (e.type_suffix (0).unsigned_p >> + ? VADCIQ_U >> + : VADCIQ_S) >> + : (e.type_suffix (0).unsigned_p >> + ? VADCQ_U >> + : VADCQ_S); >> code = code_for_mve_q_v4si (unspec, unspec); >> insns = e.use_exact_insn (code); >> break; >> >> case PRED_m: >> /* "m" predicate. */ >> - unspec = e.type_suffix (0).unsigned_p >> - ? VADCIQ_M_U >> - : VADCIQ_M_S; >> + unspec = m_init_carry >> + ? (e.type_suffix (0).unsigned_p >> + ? VADCIQ_M_U >> + : VADCIQ_M_S) >> + : (e.type_suffix (0).unsigned_p >> + ? 
VADCQ_M_U >> + : VADCQ_M_S); >> code = code_for_mve_q_m_v4si (unspec, unspec); >> insns = e.use_cond_insn (code, 0); >> break; >> @@ -816,7 +862,8 @@ namespace arm_mve { >> FUNCTION_PRED_P_S_U (vabavq, VABAVQ) >> FUNCTION_WITHOUT_N (vabdq, VABDQ) >> FUNCTION (vabsq, unspec_based_mve_function_exact_insn, (ABS, ABS, ABS, -1, -1, -1, VABSQ_M_S, -1, VABSQ_M_F, -1, -1, -1)) >> -FUNCTION (vadciq, vadc_vsbc_impl,) >> +FUNCTION (vadciq, vadc_vsbc_impl, (true)) >> +FUNCTION (vadcq, vadc_vsbc_impl, (false)) >> FUNCTION_WITH_RTX_M_N (vaddq, PLUS, VADDQ) >> FUNCTION_PRED_P_S_U (vaddlvaq, VADDLVAQ) >> FUNCTION_PRED_P_S_U (vaddlvq, VADDLVQ) >> diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def >> index 72d6461c4e4..37efa6bf13e 100644 >> --- a/gcc/config/arm/arm-mve-builtins-base.def >> +++ b/gcc/config/arm/arm-mve-builtins-base.def >> @@ -22,6 +22,7 @@ DEF_MVE_FUNCTION (vabavq, binary_acca_int32, all_integer, p_or_none) >> DEF_MVE_FUNCTION (vabdq, binary, all_integer, mx_or_none) >> DEF_MVE_FUNCTION (vabsq, unary, all_signed, mx_or_none) >> DEF_MVE_FUNCTION (vadciq, vadc_vsbc, integer_32, m_or_none) >> +DEF_MVE_FUNCTION (vadcq, vadc_vsbc, integer_32, m_or_none) >> DEF_MVE_FUNCTION (vaddlvaq, unary_widen_acc, integer_32, p_or_none) >> DEF_MVE_FUNCTION (vaddlvq, unary_acc, integer_32, p_or_none) >> DEF_MVE_FUNCTION (vaddq, binary_opt_n, all_integer, mx_or_none) >> diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h >> index 2dfc2e18062..eb8423c3fe2 100644 >> --- a/gcc/config/arm/arm-mve-builtins-base.h >> +++ b/gcc/config/arm/arm-mve-builtins-base.h >> @@ -27,6 +27,7 @@ extern const function_base *const vabavq; >> extern const function_base *const vabdq; >> extern const function_base *const vabsq; >> extern const function_base *const vadciq; >> +extern const function_base *const vadcq; >> extern const function_base *const vaddlvaq; >> extern const function_base *const vaddlvq; >> extern const function_base *const vaddq; >> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h >> index 3a0b3041c42..dd7b6f5cdab 100644 >> --- a/gcc/config/arm/arm_mve.h >> +++ b/gcc/config/arm/arm_mve.h >> @@ -85,8 +85,6 @@ >> #define vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) >> #define vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) >> #define vstrwq_scatter_base_wb(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb(__addr, __offset, __value) >> -#define vadcq(__a, __b, __carry) __arm_vadcq(__a, __b, __carry) >> -#define vadcq_m(__inactive, __a, __b, __carry, __p) __arm_vadcq_m(__inactive, __a, __b, __carry, __p) >> #define vsbciq(__a, __b, __carry_out) __arm_vsbciq(__a, __b, __carry_out) >> #define vsbciq_m(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m(__inactive, __a, __b, __carry_out, __p) >> #define vsbcq(__a, __b, __carry) __arm_vsbcq(__a, __b, __carry) >> @@ -319,10 +317,6 @@ >> #define vstrwq_scatter_base_wb_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_s32(__addr, __offset, __value) >> #define vstrwq_scatter_base_wb_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_u32(__addr, __offset, __value) >> #define vstrwq_scatter_base_wb_f32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_f32(__addr, __offset, __value) >> -#define vadcq_s32(__a, __b, __carry) __arm_vadcq_s32(__a, __b, __carry) >> -#define vadcq_u32(__a, __b, __carry) 
__arm_vadcq_u32(__a, __b, __carry) >> -#define vadcq_m_s32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_s32(__inactive, __a, __b, __carry, __p) >> -#define vadcq_m_u32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_u32(__inactive, __a, __b, __carry, __p) >> #define vsbciq_s32(__a, __b, __carry_out) __arm_vsbciq_s32(__a, __b, __carry_out) >> #define vsbciq_u32(__a, __b, __carry_out) __arm_vsbciq_u32(__a, __b, __carry_out) >> #define vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) >> @@ -1684,46 +1678,6 @@ __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint3 >> *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value); >> } >> >> -__extension__ extern __inline int32x4_t >> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> -__arm_vadcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry) >> -{ >> - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); >> - int32x4_t __res = __builtin_mve_vadcq_sv4si (__a, __b); >> - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; >> - return __res; >> -} >> - >> -__extension__ extern __inline uint32x4_t >> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> -__arm_vadcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) >> -{ >> - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); >> - uint32x4_t __res = __builtin_mve_vadcq_uv4si (__a, __b); >> - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; >> - return __res; >> -} >> - >> -__extension__ extern __inline int32x4_t >> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> -__arm_vadcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) >> -{ >> - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); >> - int32x4_t __res = __builtin_mve_vadcq_m_sv4si (__inactive, __a, __b, __p); >> - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; >> - return __res; >> -} >> - >> -__extension__ extern __inline uint32x4_t >> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> -__arm_vadcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) >> -{ >> - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); >> - uint32x4_t __res = __builtin_mve_vadcq_m_uv4si (__inactive, __a, __b, __p); >> - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; >> - return __res; >> -} >> - >> __extension__ extern __inline int32x4_t >> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> __arm_vsbciq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) >> @@ -3600,34 +3554,6 @@ __arm_vstrwq_scatter_base_wb (uint32x4_t * __addr, const int __offset, uint32x4_ >> __arm_vstrwq_scatter_base_wb_u32 (__addr, __offset, __value); >> } >> >> -__extension__ extern __inline int32x4_t >> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> -__arm_vadcq (int32x4_t __a, int32x4_t __b, unsigned * __carry) >> -{ >> - return __arm_vadcq_s32 (__a, __b, __carry); >> -} >> - >> -__extension__ extern __inline uint32x4_t >> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> -__arm_vadcq (uint32x4_t __a, uint32x4_t __b, unsigned * 
__carry) >> -{ >> - return __arm_vadcq_u32 (__a, __b, __carry); >> -} >> - >> -__extension__ extern __inline int32x4_t >> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> -__arm_vadcq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) >> -{ >> - return __arm_vadcq_m_s32 (__inactive, __a, __b, __carry, __p); >> -} >> - >> -__extension__ extern __inline uint32x4_t >> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> -__arm_vadcq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) >> -{ >> - return __arm_vadcq_m_u32 (__inactive, __a, __b, __carry, __p); >> -} >> - >> __extension__ extern __inline int32x4_t >> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >> __arm_vsbciq (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) >> @@ -5245,19 +5171,6 @@ extern void *__ARM_undef; >> int (*)[__ARM_mve_type_int64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_s64 (__ARM_mve_coerce_s64_ptr(p0, int64_t *), p1, p2), \ >> int (*)[__ARM_mve_type_uint64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_u64 (__ARM_mve_coerce_u64_ptr(p0, uint64_t *), p1, p2))) >> >> -#define __arm_vadcq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ >> - __typeof(p1) __p1 = (p1); \ >> - __typeof(p2) __p2 = (p2); \ >> - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ >> - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3, p4), \ >> - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3, p4));}) >> - >> -#define __arm_vadcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ >> - __typeof(p1) __p1 = (p1); \ >> - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ >> - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ >> - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) >> - >> #define __arm_vsbciq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ >> __typeof(p1) __p1 = (p1); \ >> __typeof(p2) __p2 = (p2); \ >
On 10/15/24 11:18, Richard Earnshaw wrote: > On 14/10/2024 19:18, Richard Earnshaw (lists) wrote: >> On 04/09/2024 14:26, Christophe Lyon wrote: >>> Implement vadcq using the new MVE builtins framework. >>> >>> We re-use most of the code introduced by the previous patch to support >>> vadciq: we just need to initialize carry from the input parameter. >>> >>> 2024-08-28 Christophe Lyon <christophe.lyon@linaro.org> >>> >>> gcc/ >>> >>> * config/arm/arm-mve-builtins-base.cc (vadcq_vsbc): Add support >>> for vadcq. >>> * config/arm/arm-mve-builtins-base.def (vadcq): New. >>> * config/arm/arm-mve-builtins-base.h (vadcq): New. >>> * config/arm/arm_mve.h (vadcq): Delete. >>> (vadcq_m): Delete. >>> (vadcq_s32): Delete. >>> (vadcq_u32): Delete. >>> (vadcq_m_s32): Delete. >>> (vadcq_m_u32): Delete. >>> (__arm_vadcq_s32): Delete. >>> (__arm_vadcq_u32): Delete. >>> (__arm_vadcq_m_s32): Delete. >>> (__arm_vadcq_m_u32): Delete. >>> (__arm_vadcq): Delete. >>> (__arm_vadcq_m): Delete. >> >>> + if (!m_init_carry) >>> + { >>> + /* Prepare carry in: >>> + set_fpscr ( (fpscr & ~0x20000000u) >>> + | ((*carry & 1u) << 29) ) */ >>> + rtx carry_in = gen_reg_rtx (SImode); >>> + rtx fpscr = gen_reg_rtx (SImode); >>> + emit_insn (gen_get_fpscr_nzcvqc (fpscr)); >>> + emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr))); >>> + >>> + emit_insn (gen_rtx_SET (carry_in, >>> + gen_rtx_ASHIFT (SImode, >>> + carry_in, >>> + GEN_INT (29)))); >>> + emit_insn (gen_rtx_SET (carry_in, >>> + gen_rtx_AND (SImode, >>> + carry_in, >>> + GEN_INT (0x20000000)))); >>> + emit_insn (gen_rtx_SET (fpscr, >>> + gen_rtx_AND (SImode, >>> + fpscr, >>> + GEN_INT (~0x20000000)))); >>> + emit_insn (gen_rtx_SET (carry_in, >>> + gen_rtx_IOR (SImode, >>> + carry_in, >>> + fpscr))); >>> + emit_insn (gen_set_fpscr_nzcvqc (carry_in)); >>> + } >> >> What's the logic here? Are we just trying to set the C flag to *carry != 0 (is carry a bool?)? Do we really need to preserve all the other bits in NZCV? I wouldn't have thought so, suggesting that: >> >> CMP *carry, #1 // Set C if *carry != 0 >> >> ought to be enough, without having to generate a read-modify-write sequence. > > I realised last night that this is setting up the fpsr not the cpsr, so my suggestion won't work. I am concerned that expanding this too early will leave something that we can't optimize away if we have back-to-back vadcq intrinsics that chain the carry, but I guess this is no different from what we have already. > Indeed, this is just replicating what the previous implementation is doing. > On that basis, this patch is also OK. We may need to revisit this sequence later to check that we are removing redundant reads + sets. > > R. > Thanks, Christophe >> >> R. >> >>> --- >>> gcc/config/arm/arm-mve-builtins-base.cc | 61 +++++++++++++++-- >>> gcc/config/arm/arm-mve-builtins-base.def | 1 + >>> gcc/config/arm/arm-mve-builtins-base.h | 1 + >>> gcc/config/arm/arm_mve.h | 87 ------------------------ >>> 4 files changed, 56 insertions(+), 94 deletions(-) >>> >>> diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc >>> index 6f3b18c2915..9c2e11356ef 100644 >>> --- a/gcc/config/arm/arm-mve-builtins-base.cc >>> +++ b/gcc/config/arm/arm-mve-builtins-base.cc >>> @@ -559,10 +559,19 @@ public: >>> class vadc_vsbc_impl : public function_base >>> { >>> public: >>> + CONSTEXPR vadc_vsbc_impl (bool init_carry) >>> + : m_init_carry (init_carry) >>> + {} >>> + >>> + /* Initialize carry with 0 (vadci). 
*/ >>> + bool m_init_carry; >>> + >>> unsigned int >>> call_properties (const function_instance &) const override >>> { >>> unsigned int flags = CP_WRITE_MEMORY | CP_READ_FPCR; >>> + if (!m_init_carry) >>> + flags |= CP_READ_MEMORY; >>> return flags; >>> } >>> >>> @@ -605,22 +614,59 @@ public: >>> carry_ptr = e.args[carry_out_arg_no]; >>> e.args.ordered_remove (carry_out_arg_no); >>> >>> + if (!m_init_carry) >>> + { >>> + /* Prepare carry in: >>> + set_fpscr ( (fpscr & ~0x20000000u) >>> + | ((*carry & 1u) << 29) ) */ >>> + rtx carry_in = gen_reg_rtx (SImode); >>> + rtx fpscr = gen_reg_rtx (SImode); >>> + emit_insn (gen_get_fpscr_nzcvqc (fpscr)); >>> + emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr))); >>> + >>> + emit_insn (gen_rtx_SET (carry_in, >>> + gen_rtx_ASHIFT (SImode, >>> + carry_in, >>> + GEN_INT (29)))); >>> + emit_insn (gen_rtx_SET (carry_in, >>> + gen_rtx_AND (SImode, >>> + carry_in, >>> + GEN_INT (0x20000000)))); >>> + emit_insn (gen_rtx_SET (fpscr, >>> + gen_rtx_AND (SImode, >>> + fpscr, >>> + GEN_INT (~0x20000000)))); >>> + emit_insn (gen_rtx_SET (carry_in, >>> + gen_rtx_IOR (SImode, >>> + carry_in, >>> + fpscr))); >>> + emit_insn (gen_set_fpscr_nzcvqc (carry_in)); >>> + } >>> + >>> switch (e.pred) >>> { >>> case PRED_none: >>> /* No predicate. */ >>> - unspec = e.type_suffix (0).unsigned_p >>> - ? VADCIQ_U >>> - : VADCIQ_S; >>> + unspec = m_init_carry >>> + ? (e.type_suffix (0).unsigned_p >>> + ? VADCIQ_U >>> + : VADCIQ_S) >>> + : (e.type_suffix (0).unsigned_p >>> + ? VADCQ_U >>> + : VADCQ_S); >>> code = code_for_mve_q_v4si (unspec, unspec); >>> insns = e.use_exact_insn (code); >>> break; >>> >>> case PRED_m: >>> /* "m" predicate. */ >>> - unspec = e.type_suffix (0).unsigned_p >>> - ? VADCIQ_M_U >>> - : VADCIQ_M_S; >>> + unspec = m_init_carry >>> + ? (e.type_suffix (0).unsigned_p >>> + ? VADCIQ_M_U >>> + : VADCIQ_M_S) >>> + : (e.type_suffix (0).unsigned_p >>> + ? 
VADCQ_M_U >>> + : VADCQ_M_S); >>> code = code_for_mve_q_m_v4si (unspec, unspec); >>> insns = e.use_cond_insn (code, 0); >>> break; >>> @@ -816,7 +862,8 @@ namespace arm_mve { >>> FUNCTION_PRED_P_S_U (vabavq, VABAVQ) >>> FUNCTION_WITHOUT_N (vabdq, VABDQ) >>> FUNCTION (vabsq, unspec_based_mve_function_exact_insn, (ABS, ABS, ABS, -1, -1, -1, VABSQ_M_S, -1, VABSQ_M_F, -1, -1, -1)) >>> -FUNCTION (vadciq, vadc_vsbc_impl,) >>> +FUNCTION (vadciq, vadc_vsbc_impl, (true)) >>> +FUNCTION (vadcq, vadc_vsbc_impl, (false)) >>> FUNCTION_WITH_RTX_M_N (vaddq, PLUS, VADDQ) >>> FUNCTION_PRED_P_S_U (vaddlvaq, VADDLVAQ) >>> FUNCTION_PRED_P_S_U (vaddlvq, VADDLVQ) >>> diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def >>> index 72d6461c4e4..37efa6bf13e 100644 >>> --- a/gcc/config/arm/arm-mve-builtins-base.def >>> +++ b/gcc/config/arm/arm-mve-builtins-base.def >>> @@ -22,6 +22,7 @@ DEF_MVE_FUNCTION (vabavq, binary_acca_int32, all_integer, p_or_none) >>> DEF_MVE_FUNCTION (vabdq, binary, all_integer, mx_or_none) >>> DEF_MVE_FUNCTION (vabsq, unary, all_signed, mx_or_none) >>> DEF_MVE_FUNCTION (vadciq, vadc_vsbc, integer_32, m_or_none) >>> +DEF_MVE_FUNCTION (vadcq, vadc_vsbc, integer_32, m_or_none) >>> DEF_MVE_FUNCTION (vaddlvaq, unary_widen_acc, integer_32, p_or_none) >>> DEF_MVE_FUNCTION (vaddlvq, unary_acc, integer_32, p_or_none) >>> DEF_MVE_FUNCTION (vaddq, binary_opt_n, all_integer, mx_or_none) >>> diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h >>> index 2dfc2e18062..eb8423c3fe2 100644 >>> --- a/gcc/config/arm/arm-mve-builtins-base.h >>> +++ b/gcc/config/arm/arm-mve-builtins-base.h >>> @@ -27,6 +27,7 @@ extern const function_base *const vabavq; >>> extern const function_base *const vabdq; >>> extern const function_base *const vabsq; >>> extern const function_base *const vadciq; >>> +extern const function_base *const vadcq; >>> extern const function_base *const vaddlvaq; >>> extern const function_base *const vaddlvq; >>> extern const function_base *const vaddq; >>> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h >>> index 3a0b3041c42..dd7b6f5cdab 100644 >>> --- a/gcc/config/arm/arm_mve.h >>> +++ b/gcc/config/arm/arm_mve.h >>> @@ -85,8 +85,6 @@ >>> #define vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) >>> #define vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) >>> #define vstrwq_scatter_base_wb(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb(__addr, __offset, __value) >>> -#define vadcq(__a, __b, __carry) __arm_vadcq(__a, __b, __carry) >>> -#define vadcq_m(__inactive, __a, __b, __carry, __p) __arm_vadcq_m(__inactive, __a, __b, __carry, __p) >>> #define vsbciq(__a, __b, __carry_out) __arm_vsbciq(__a, __b, __carry_out) >>> #define vsbciq_m(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m(__inactive, __a, __b, __carry_out, __p) >>> #define vsbcq(__a, __b, __carry) __arm_vsbcq(__a, __b, __carry) >>> @@ -319,10 +317,6 @@ >>> #define vstrwq_scatter_base_wb_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_s32(__addr, __offset, __value) >>> #define vstrwq_scatter_base_wb_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_u32(__addr, __offset, __value) >>> #define vstrwq_scatter_base_wb_f32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_f32(__addr, __offset, __value) >>> -#define vadcq_s32(__a, __b, __carry) __arm_vadcq_s32(__a, 
__b, __carry) >>> -#define vadcq_u32(__a, __b, __carry) __arm_vadcq_u32(__a, __b, __carry) >>> -#define vadcq_m_s32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_s32(__inactive, __a, __b, __carry, __p) >>> -#define vadcq_m_u32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_u32(__inactive, __a, __b, __carry, __p) >>> #define vsbciq_s32(__a, __b, __carry_out) __arm_vsbciq_s32(__a, __b, __carry_out) >>> #define vsbciq_u32(__a, __b, __carry_out) __arm_vsbciq_u32(__a, __b, __carry_out) >>> #define vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) >>> @@ -1684,46 +1678,6 @@ __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint3 >>> *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value); >>> } >>> >>> -__extension__ extern __inline int32x4_t >>> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> -__arm_vadcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry) >>> -{ >>> - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); >>> - int32x4_t __res = __builtin_mve_vadcq_sv4si (__a, __b); >>> - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; >>> - return __res; >>> -} >>> - >>> -__extension__ extern __inline uint32x4_t >>> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> -__arm_vadcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) >>> -{ >>> - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); >>> - uint32x4_t __res = __builtin_mve_vadcq_uv4si (__a, __b); >>> - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; >>> - return __res; >>> -} >>> - >>> -__extension__ extern __inline int32x4_t >>> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> -__arm_vadcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) >>> -{ >>> - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); >>> - int32x4_t __res = __builtin_mve_vadcq_m_sv4si (__inactive, __a, __b, __p); >>> - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; >>> - return __res; >>> -} >>> - >>> -__extension__ extern __inline uint32x4_t >>> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> -__arm_vadcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) >>> -{ >>> - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); >>> - uint32x4_t __res = __builtin_mve_vadcq_m_uv4si (__inactive, __a, __b, __p); >>> - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; >>> - return __res; >>> -} >>> - >>> __extension__ extern __inline int32x4_t >>> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> __arm_vsbciq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) >>> @@ -3600,34 +3554,6 @@ __arm_vstrwq_scatter_base_wb (uint32x4_t * __addr, const int __offset, uint32x4_ >>> __arm_vstrwq_scatter_base_wb_u32 (__addr, __offset, __value); >>> } >>> >>> -__extension__ extern __inline int32x4_t >>> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> -__arm_vadcq (int32x4_t __a, int32x4_t __b, unsigned * __carry) >>> -{ >>> - return __arm_vadcq_s32 (__a, __b, __carry); >>> -} >>> - >>> -__extension__ extern __inline uint32x4_t >>> 
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> -__arm_vadcq (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) >>> -{ >>> - return __arm_vadcq_u32 (__a, __b, __carry); >>> -} >>> - >>> -__extension__ extern __inline int32x4_t >>> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> -__arm_vadcq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) >>> -{ >>> - return __arm_vadcq_m_s32 (__inactive, __a, __b, __carry, __p); >>> -} >>> - >>> -__extension__ extern __inline uint32x4_t >>> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> -__arm_vadcq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) >>> -{ >>> - return __arm_vadcq_m_u32 (__inactive, __a, __b, __carry, __p); >>> -} >>> - >>> __extension__ extern __inline int32x4_t >>> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>> __arm_vsbciq (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) >>> @@ -5245,19 +5171,6 @@ extern void *__ARM_undef; >>> int (*)[__ARM_mve_type_int64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_s64 (__ARM_mve_coerce_s64_ptr(p0, int64_t *), p1, p2), \ >>> int (*)[__ARM_mve_type_uint64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_u64 (__ARM_mve_coerce_u64_ptr(p0, uint64_t *), p1, p2))) >>> >>> -#define __arm_vadcq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ >>> - __typeof(p1) __p1 = (p1); \ >>> - __typeof(p2) __p2 = (p2); \ >>> - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ >>> - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3, p4), \ >>> - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3, p4));}) >>> - >>> -#define __arm_vadcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ >>> - __typeof(p1) __p1 = (p1); \ >>> - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ >>> - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ >>> - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) >>> - >>> #define __arm_vsbciq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ >>> __typeof(p1) __p1 = (p1); \ >>> __typeof(p2) __p2 = (p2); \ >> >
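
Regarding the concern about back-to-back vadcq intrinsics chaining the carry: a hedged usage sketch (function and variable names are illustrative) showing how the carry is threaded through the pointer argument, which is what makes each vadcq expansion read and rewrite FPSCR:

  #include <arm_mve.h>

  /* Illustrative sketch: vadciq starts the chain with carry-in fixed at 0
     and writes the carry-out; vadcq then reads the carry-in from *carry
     and writes the new carry-out back, chaining through FPSCR.C.  */
  uint32x4_t
  add_with_chained_carry (uint32x4_t a_lo, uint32x4_t b_lo,
                          uint32x4_t a_hi, uint32x4_t b_hi,
                          uint32x4_t *sum_lo)
  {
    unsigned carry;
    *sum_lo = vadciq (a_lo, b_lo, &carry);
    return vadcq (a_hi, b_hi, &carry);
  }
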
diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc index 6f3b18c2915..9c2e11356ef 100644 --- a/gcc/config/arm/arm-mve-builtins-base.cc +++ b/gcc/config/arm/arm-mve-builtins-base.cc @@ -559,10 +559,19 @@ public: class vadc_vsbc_impl : public function_base { public: + CONSTEXPR vadc_vsbc_impl (bool init_carry) + : m_init_carry (init_carry) + {} + + /* Initialize carry with 0 (vadci). */ + bool m_init_carry; + unsigned int call_properties (const function_instance &) const override { unsigned int flags = CP_WRITE_MEMORY | CP_READ_FPCR; + if (!m_init_carry) + flags |= CP_READ_MEMORY; return flags; } @@ -605,22 +614,59 @@ public: carry_ptr = e.args[carry_out_arg_no]; e.args.ordered_remove (carry_out_arg_no); + if (!m_init_carry) + { + /* Prepare carry in: + set_fpscr ( (fpscr & ~0x20000000u) + | ((*carry & 1u) << 29) ) */ + rtx carry_in = gen_reg_rtx (SImode); + rtx fpscr = gen_reg_rtx (SImode); + emit_insn (gen_get_fpscr_nzcvqc (fpscr)); + emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr))); + + emit_insn (gen_rtx_SET (carry_in, + gen_rtx_ASHIFT (SImode, + carry_in, + GEN_INT (29)))); + emit_insn (gen_rtx_SET (carry_in, + gen_rtx_AND (SImode, + carry_in, + GEN_INT (0x20000000)))); + emit_insn (gen_rtx_SET (fpscr, + gen_rtx_AND (SImode, + fpscr, + GEN_INT (~0x20000000)))); + emit_insn (gen_rtx_SET (carry_in, + gen_rtx_IOR (SImode, + carry_in, + fpscr))); + emit_insn (gen_set_fpscr_nzcvqc (carry_in)); + } + switch (e.pred) { case PRED_none: /* No predicate. */ - unspec = e.type_suffix (0).unsigned_p - ? VADCIQ_U - : VADCIQ_S; + unspec = m_init_carry + ? (e.type_suffix (0).unsigned_p + ? VADCIQ_U + : VADCIQ_S) + : (e.type_suffix (0).unsigned_p + ? VADCQ_U + : VADCQ_S); code = code_for_mve_q_v4si (unspec, unspec); insns = e.use_exact_insn (code); break; case PRED_m: /* "m" predicate. */ - unspec = e.type_suffix (0).unsigned_p - ? VADCIQ_M_U - : VADCIQ_M_S; + unspec = m_init_carry + ? (e.type_suffix (0).unsigned_p + ? VADCIQ_M_U + : VADCIQ_M_S) + : (e.type_suffix (0).unsigned_p + ? 
VADCQ_M_U + : VADCQ_M_S); code = code_for_mve_q_m_v4si (unspec, unspec); insns = e.use_cond_insn (code, 0); break; @@ -816,7 +862,8 @@ namespace arm_mve { FUNCTION_PRED_P_S_U (vabavq, VABAVQ) FUNCTION_WITHOUT_N (vabdq, VABDQ) FUNCTION (vabsq, unspec_based_mve_function_exact_insn, (ABS, ABS, ABS, -1, -1, -1, VABSQ_M_S, -1, VABSQ_M_F, -1, -1, -1)) -FUNCTION (vadciq, vadc_vsbc_impl,) +FUNCTION (vadciq, vadc_vsbc_impl, (true)) +FUNCTION (vadcq, vadc_vsbc_impl, (false)) FUNCTION_WITH_RTX_M_N (vaddq, PLUS, VADDQ) FUNCTION_PRED_P_S_U (vaddlvaq, VADDLVAQ) FUNCTION_PRED_P_S_U (vaddlvq, VADDLVQ) diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def index 72d6461c4e4..37efa6bf13e 100644 --- a/gcc/config/arm/arm-mve-builtins-base.def +++ b/gcc/config/arm/arm-mve-builtins-base.def @@ -22,6 +22,7 @@ DEF_MVE_FUNCTION (vabavq, binary_acca_int32, all_integer, p_or_none) DEF_MVE_FUNCTION (vabdq, binary, all_integer, mx_or_none) DEF_MVE_FUNCTION (vabsq, unary, all_signed, mx_or_none) DEF_MVE_FUNCTION (vadciq, vadc_vsbc, integer_32, m_or_none) +DEF_MVE_FUNCTION (vadcq, vadc_vsbc, integer_32, m_or_none) DEF_MVE_FUNCTION (vaddlvaq, unary_widen_acc, integer_32, p_or_none) DEF_MVE_FUNCTION (vaddlvq, unary_acc, integer_32, p_or_none) DEF_MVE_FUNCTION (vaddq, binary_opt_n, all_integer, mx_or_none) diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h index 2dfc2e18062..eb8423c3fe2 100644 --- a/gcc/config/arm/arm-mve-builtins-base.h +++ b/gcc/config/arm/arm-mve-builtins-base.h @@ -27,6 +27,7 @@ extern const function_base *const vabavq; extern const function_base *const vabdq; extern const function_base *const vabsq; extern const function_base *const vadciq; +extern const function_base *const vadcq; extern const function_base *const vaddlvaq; extern const function_base *const vaddlvq; extern const function_base *const vaddq; diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index 3a0b3041c42..dd7b6f5cdab 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -85,8 +85,6 @@ #define vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) #define vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) #define vstrwq_scatter_base_wb(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb(__addr, __offset, __value) -#define vadcq(__a, __b, __carry) __arm_vadcq(__a, __b, __carry) -#define vadcq_m(__inactive, __a, __b, __carry, __p) __arm_vadcq_m(__inactive, __a, __b, __carry, __p) #define vsbciq(__a, __b, __carry_out) __arm_vsbciq(__a, __b, __carry_out) #define vsbciq_m(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m(__inactive, __a, __b, __carry_out, __p) #define vsbcq(__a, __b, __carry) __arm_vsbcq(__a, __b, __carry) @@ -319,10 +317,6 @@ #define vstrwq_scatter_base_wb_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_s32(__addr, __offset, __value) #define vstrwq_scatter_base_wb_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_u32(__addr, __offset, __value) #define vstrwq_scatter_base_wb_f32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_f32(__addr, __offset, __value) -#define vadcq_s32(__a, __b, __carry) __arm_vadcq_s32(__a, __b, __carry) -#define vadcq_u32(__a, __b, __carry) __arm_vadcq_u32(__a, __b, __carry) -#define vadcq_m_s32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_s32(__inactive, __a, __b, __carry, __p) -#define 
vadcq_m_u32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_u32(__inactive, __a, __b, __carry, __p) #define vsbciq_s32(__a, __b, __carry_out) __arm_vsbciq_s32(__a, __b, __carry_out) #define vsbciq_u32(__a, __b, __carry_out) __arm_vsbciq_u32(__a, __b, __carry_out) #define vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) @@ -1684,46 +1678,6 @@ __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint3 *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value); } -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - int32x4_t __res = __builtin_mve_vadcq_sv4si (__a, __b); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - uint32x4_t __res = __builtin_mve_vadcq_uv4si (__a, __b); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - int32x4_t __res = __builtin_mve_vadcq_m_sv4si (__inactive, __a, __b, __p); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - uint32x4_t __res = __builtin_mve_vadcq_m_uv4si (__inactive, __a, __b, __p); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vsbciq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) @@ -3600,34 +3554,6 @@ __arm_vstrwq_scatter_base_wb (uint32x4_t * __addr, const int __offset, uint32x4_ __arm_vstrwq_scatter_base_wb_u32 (__addr, __offset, __value); } -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq (int32x4_t __a, int32x4_t __b, unsigned * __carry) -{ - return __arm_vadcq_s32 (__a, __b, __carry); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) -{ - return __arm_vadcq_u32 (__a, __b, __carry); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - return __arm_vadcq_m_s32 (__inactive, __a, __b, __carry, __p); -} - 
-__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - return __arm_vadcq_m_u32 (__inactive, __a, __b, __carry, __p); -} - __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vsbciq (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) @@ -5245,19 +5171,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_s64 (__ARM_mve_coerce_s64_ptr(p0, int64_t *), p1, p2), \ int (*)[__ARM_mve_type_uint64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_u64 (__ARM_mve_coerce_u64_ptr(p0, uint64_t *), p1, p2))) -#define __arm_vadcq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3, p4), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3, p4));}) - -#define __arm_vadcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - #define __arm_vsbciq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \