
[v2,25/36] arm: [MVE intrinsics] rework vdwdup viwdup

Message ID 20240904132650.2720446-26-christophe.lyon@linaro.org
State New
Series arm: [MVE intrinsics] Re-implement more intrinsics

Commit Message

Christophe Lyon Sept. 4, 2024, 1:26 p.m. UTC
Implement vdwdup and viwdup using the new MVE builtins framework.

In order to share more code with viddup_impl, the patch swaps operands
1 and 2 in @mve_v[id]wdupq_m_wb_u<mode>_insn, so that the parameter
order is similar to what @mve_v[id]dupq_m_wb_u<mode>_insn uses.

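For context, a minimal usage sketch of the intrinsics being reworked.
The signatures match the hand-written versions this patch deletes from
arm_mve.h (the intrinsics themselves remain available through the
framework); the wrapper functions and the scalar description of the
wrapping behaviour are mine, based on my reading of the Arm MVE
documentation, so treat them as illustrative only:

  #include <arm_mve.h>

  /* viwdupq_u8 (a, b, imm): each lane receives the running offset,
     which starts at a, is incremented by imm (1, 2, 4 or 8) after
     each lane, and wraps back to 0 when it reaches the buffer size b.
     vdwdupq_u8 is the decrementing counterpart.  */
  uint8x16_t
  wrapping_indexes (uint32_t start, uint32_t size)
  {
    return viwdupq_u8 (start, size, 1);
  }

  /* The _wb (writeback) forms also store the updated offset back
     through the pointer, ready for the next loop iteration.  */
  uint8x16_t
  wrapping_indexes_wb (uint32_t *offset, uint32_t size)
  {
    return viwdupq_wb_u8 (offset, size, 1);
  }
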
2024-08-28  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm-mve-builtins-base.cc (viddup_impl): Add support
	for wrapping versions.
	(vdwdupq): New.
	(viwdupq): New.
	* config/arm/arm-mve-builtins-base.def (vdwdupq): New.
	(viwdupq): New.
	* config/arm/arm-mve-builtins-base.h (vdwdupq): New.
	(viwdupq): New.
	* config/arm/arm_mve.h (vdwdupq_m): Delete.
	(vdwdupq_u8): Delete.
	(vdwdupq_u32): Delete.
	(vdwdupq_u16): Delete.
	(viwdupq_m): Delete.
	(viwdupq_u8): Delete.
	(viwdupq_u32): Delete.
	(viwdupq_u16): Delete.
	(vdwdupq_x_u8): Delete.
	(vdwdupq_x_u16): Delete.
	(vdwdupq_x_u32): Delete.
	(viwdupq_x_u8): Delete.
	(viwdupq_x_u16): Delete.
	(viwdupq_x_u32): Delete.
	(vdwdupq_m_n_u8): Delete.
	(vdwdupq_m_n_u32): Delete.
	(vdwdupq_m_n_u16): Delete.
	(vdwdupq_m_wb_u8): Delete.
	(vdwdupq_m_wb_u32): Delete.
	(vdwdupq_m_wb_u16): Delete.
	(vdwdupq_n_u8): Delete.
	(vdwdupq_n_u32): Delete.
	(vdwdupq_n_u16): Delete.
	(vdwdupq_wb_u8): Delete.
	(vdwdupq_wb_u32): Delete.
	(vdwdupq_wb_u16): Delete.
	(viwdupq_m_n_u8): Delete.
	(viwdupq_m_n_u32): Delete.
	(viwdupq_m_n_u16): Delete.
	(viwdupq_m_wb_u8): Delete.
	(viwdupq_m_wb_u32): Delete.
	(viwdupq_m_wb_u16): Delete.
	(viwdupq_n_u8): Delete.
	(viwdupq_n_u32): Delete.
	(viwdupq_n_u16): Delete.
	(viwdupq_wb_u8): Delete.
	(viwdupq_wb_u32): Delete.
	(viwdupq_wb_u16): Delete.
	(vdwdupq_x_n_u8): Delete.
	(vdwdupq_x_n_u16): Delete.
	(vdwdupq_x_n_u32): Delete.
	(vdwdupq_x_wb_u8): Delete.
	(vdwdupq_x_wb_u16): Delete.
	(vdwdupq_x_wb_u32): Delete.
	(viwdupq_x_n_u8): Delete.
	(viwdupq_x_n_u16): Delete.
	(viwdupq_x_n_u32): Delete.
	(viwdupq_x_wb_u8): Delete.
	(viwdupq_x_wb_u16): Delete.
	(viwdupq_x_wb_u32): Delete.
	(__arm_vdwdupq_m_n_u8): Delete.
	(__arm_vdwdupq_m_n_u32): Delete.
	(__arm_vdwdupq_m_n_u16): Delete.
	(__arm_vdwdupq_m_wb_u8): Delete.
	(__arm_vdwdupq_m_wb_u32): Delete.
	(__arm_vdwdupq_m_wb_u16): Delete.
	(__arm_vdwdupq_n_u8): Delete.
	(__arm_vdwdupq_n_u32): Delete.
	(__arm_vdwdupq_n_u16): Delete.
	(__arm_vdwdupq_wb_u8): Delete.
	(__arm_vdwdupq_wb_u32): Delete.
	(__arm_vdwdupq_wb_u16): Delete.
	(__arm_viwdupq_m_n_u8): Delete.
	(__arm_viwdupq_m_n_u32): Delete.
	(__arm_viwdupq_m_n_u16): Delete.
	(__arm_viwdupq_m_wb_u8): Delete.
	(__arm_viwdupq_m_wb_u32): Delete.
	(__arm_viwdupq_m_wb_u16): Delete.
	(__arm_viwdupq_n_u8): Delete.
	(__arm_viwdupq_n_u32): Delete.
	(__arm_viwdupq_n_u16): Delete.
	(__arm_viwdupq_wb_u8): Delete.
	(__arm_viwdupq_wb_u32): Delete.
	(__arm_viwdupq_wb_u16): Delete.
	(__arm_vdwdupq_x_n_u8): Delete.
	(__arm_vdwdupq_x_n_u16): Delete.
	(__arm_vdwdupq_x_n_u32): Delete.
	(__arm_vdwdupq_x_wb_u8): Delete.
	(__arm_vdwdupq_x_wb_u16): Delete.
	(__arm_vdwdupq_x_wb_u32): Delete.
	(__arm_viwdupq_x_n_u8): Delete.
	(__arm_viwdupq_x_n_u16): Delete.
	(__arm_viwdupq_x_n_u32): Delete.
	(__arm_viwdupq_x_wb_u8): Delete.
	(__arm_viwdupq_x_wb_u16): Delete.
	(__arm_viwdupq_x_wb_u32): Delete.
	(__arm_vdwdupq_m): Delete.
	(__arm_vdwdupq_u8): Delete.
	(__arm_vdwdupq_u32): Delete.
	(__arm_vdwdupq_u16): Delete.
	(__arm_viwdupq_m): Delete.
	(__arm_viwdupq_u8): Delete.
	(__arm_viwdupq_u32): Delete.
	(__arm_viwdupq_u16): Delete.
	(__arm_vdwdupq_x_u8): Delete.
	(__arm_vdwdupq_x_u16): Delete.
	(__arm_vdwdupq_x_u32): Delete.
	(__arm_viwdupq_x_u8): Delete.
	(__arm_viwdupq_x_u16): Delete.
	(__arm_viwdupq_x_u32): Delete.
	* config/arm/mve.md (@mve_<mve_insn>q_m_wb_u<mode>_insn): Swap
	operands 1 and 2.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |  62 +-
 gcc/config/arm/arm-mve-builtins-base.def |   2 +
 gcc/config/arm/arm-mve-builtins-base.h   |   2 +
 gcc/config/arm/arm_mve.h                 | 714 -----------------------
 gcc/config/arm/mve.md                    |  10 +-
 5 files changed, 53 insertions(+), 737 deletions(-)

Comments

Richard Earnshaw (lists) Oct. 14, 2024, 5:49 p.m. UTC | #1
On 04/09/2024 14:26, Christophe Lyon wrote:
> Implement vdwdup and viwdup using the new MVE builtins framework.
> 
> In order to share more code with viddup_impl, the patch swaps operands
> 1 and 2 in @mve_v[id]wdupq_m_wb_u<mode>_insn, so that the parameter
> order is similar to what @mve_v[id]dupq_m_wb_u<mode>_insn uses.
> 
> 2024-08-28  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/arm-mve-builtins-base.cc (viddup_impl): Add support
> 	for wrapping versions.
> 	(vdwdupq): New.
> 	(viwdupq): New.
> 	* config/arm/arm-mve-builtins-base.def (vdwdupq): New.
> 	(viwdupq): New.
> 	* config/arm/arm-mve-builtins-base.h (vdwdupq): New.
> 	(viwdupq): New.
> 	* config/arm/arm_mve.h (vdwdupq_m): Delete.
> 	(vdwdupq_u8): Delete.
> 	(vdwdupq_u32): Delete.
> 	(vdwdupq_u16): Delete.
> 	(viwdupq_m): Delete.
> 	(viwdupq_u8): Delete.
> 	(viwdupq_u32): Delete.
> 	(viwdupq_u16): Delete.
> 	(vdwdupq_x_u8): Delete.
> 	(vdwdupq_x_u16): Delete.
> 	(vdwdupq_x_u32): Delete.
> 	(viwdupq_x_u8): Delete.
> 	(viwdupq_x_u16): Delete.
> 	(viwdupq_x_u32): Delete.
> 	(vdwdupq_m_n_u8): Delete.
> 	(vdwdupq_m_n_u32): Delete.
> 	(vdwdupq_m_n_u16): Delete.
> 	(vdwdupq_m_wb_u8): Delete.
> 	(vdwdupq_m_wb_u32): Delete.
> 	(vdwdupq_m_wb_u16): Delete.
> 	(vdwdupq_n_u8): Delete.
> 	(vdwdupq_n_u32): Delete.
> 	(vdwdupq_n_u16): Delete.
> 	(vdwdupq_wb_u8): Delete.
> 	(vdwdupq_wb_u32): Delete.
> 	(vdwdupq_wb_u16): Delete.
> 	(viwdupq_m_n_u8): Delete.
> 	(viwdupq_m_n_u32): Delete.
> 	(viwdupq_m_n_u16): Delete.
> 	(viwdupq_m_wb_u8): Delete.
> 	(viwdupq_m_wb_u32): Delete.
> 	(viwdupq_m_wb_u16): Delete.
> 	(viwdupq_n_u8): Delete.
> 	(viwdupq_n_u32): Delete.
> 	(viwdupq_n_u16): Delete.
> 	(viwdupq_wb_u8): Delete.
> 	(viwdupq_wb_u32): Delete.
> 	(viwdupq_wb_u16): Delete.
> 	(vdwdupq_x_n_u8): Delete.
> 	(vdwdupq_x_n_u16): Delete.
> 	(vdwdupq_x_n_u32): Delete.
> 	(vdwdupq_x_wb_u8): Delete.
> 	(vdwdupq_x_wb_u16): Delete.
> 	(vdwdupq_x_wb_u32): Delete.
> 	(viwdupq_x_n_u8): Delete.
> 	(viwdupq_x_n_u16): Delete.
> 	(viwdupq_x_n_u32): Delete.
> 	(viwdupq_x_wb_u8): Delete.
> 	(viwdupq_x_wb_u16): Delete.
> 	(viwdupq_x_wb_u32): Delete.
> 	(__arm_vdwdupq_m_n_u8): Delete.
> 	(__arm_vdwdupq_m_n_u32): Delete.
> 	(__arm_vdwdupq_m_n_u16): Delete.
> 	(__arm_vdwdupq_m_wb_u8): Delete.
> 	(__arm_vdwdupq_m_wb_u32): Delete.
> 	(__arm_vdwdupq_m_wb_u16): Delete.
> 	(__arm_vdwdupq_n_u8): Delete.
> 	(__arm_vdwdupq_n_u32): Delete.
> 	(__arm_vdwdupq_n_u16): Delete.
> 	(__arm_vdwdupq_wb_u8): Delete.
> 	(__arm_vdwdupq_wb_u32): Delete.
> 	(__arm_vdwdupq_wb_u16): Delete.
> 	(__arm_viwdupq_m_n_u8): Delete.
> 	(__arm_viwdupq_m_n_u32): Delete.
> 	(__arm_viwdupq_m_n_u16): Delete.
> 	(__arm_viwdupq_m_wb_u8): Delete.
> 	(__arm_viwdupq_m_wb_u32): Delete.
> 	(__arm_viwdupq_m_wb_u16): Delete.
> 	(__arm_viwdupq_n_u8): Delete.
> 	(__arm_viwdupq_n_u32): Delete.
> 	(__arm_viwdupq_n_u16): Delete.
> 	(__arm_viwdupq_wb_u8): Delete.
> 	(__arm_viwdupq_wb_u32): Delete.
> 	(__arm_viwdupq_wb_u16): Delete.
> 	(__arm_vdwdupq_x_n_u8): Delete.
> 	(__arm_vdwdupq_x_n_u16): Delete.
> 	(__arm_vdwdupq_x_n_u32): Delete.
> 	(__arm_vdwdupq_x_wb_u8): Delete.
> 	(__arm_vdwdupq_x_wb_u16): Delete.
> 	(__arm_vdwdupq_x_wb_u32): Delete.
> 	(__arm_viwdupq_x_n_u8): Delete.
> 	(__arm_viwdupq_x_n_u16): Delete.
> 	(__arm_viwdupq_x_n_u32): Delete.
> 	(__arm_viwdupq_x_wb_u8): Delete.
> 	(__arm_viwdupq_x_wb_u16): Delete.
> 	(__arm_viwdupq_x_wb_u32): Delete.
> 	(__arm_vdwdupq_m): Delete.
> 	(__arm_vdwdupq_u8): Delete.
> 	(__arm_vdwdupq_u32): Delete.
> 	(__arm_vdwdupq_u16): Delete.
> 	(__arm_viwdupq_m): Delete.
> 	(__arm_viwdupq_u8): Delete.
> 	(__arm_viwdupq_u32): Delete.
> 	(__arm_viwdupq_u16): Delete.
> 	(__arm_vdwdupq_x_u8): Delete.
> 	(__arm_vdwdupq_x_u16): Delete.
> 	(__arm_vdwdupq_x_u32): Delete.
> 	(__arm_viwdupq_x_u8): Delete.
> 	(__arm_viwdupq_x_u16): Delete.
> 	(__arm_viwdupq_x_u32): Delete.
> 	* config/arm/mve.md (@mve_<mve_insn>q_m_wb_u<mode>_insn): Swap
> 	operands 1 and 2.

OK.

R.

> ---
>  gcc/config/arm/arm-mve-builtins-base.cc  |  62 +-
>  gcc/config/arm/arm-mve-builtins-base.def |   2 +
>  gcc/config/arm/arm-mve-builtins-base.h   |   2 +
>  gcc/config/arm/arm_mve.h                 | 714 -----------------------
>  gcc/config/arm/mve.md                    |  10 +-
>  5 files changed, 53 insertions(+), 737 deletions(-)
> 
> diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc
> index 3d8bcdabe24..eaf054d9823 100644
> --- a/gcc/config/arm/arm-mve-builtins-base.cc
> +++ b/gcc/config/arm/arm-mve-builtins-base.cc
> @@ -354,16 +354,19 @@ public:
>     vector mode associated with type suffix 0.  We need this special case
>     because in MODE_wb the builtins dereference the first parameter and update
>     its contents.  We also have to insert the two additional parameters needed
> -   by the builtins compared to the intrinsics.  */
> +   by the builtins compared to the intrinsics.  In wrapping mode, we have to
> +   match the 'hack' to make sure the 'wrap' parameter is in an odd register.  */
>  class viddup_impl : public function_base
>  {
>  public:
> -  CONSTEXPR viddup_impl (bool inc_dec)
> -    : m_inc_dec (inc_dec)
> +  CONSTEXPR viddup_impl (bool inc_dec, bool wrap)
> +    : m_inc_dec (inc_dec), m_wrap (wrap)
>    {}
>  
>    /* Increment (true) or decrement (false).  */
>    bool m_inc_dec;
> +  /* v[id]wdup (true) or v[id]dup (false).  */
> +  bool m_wrap;
>  
>    unsigned int
>    call_properties (const function_instance &fi) const override
> @@ -388,7 +391,6 @@ public:
>      rtx insns, offset_ptr;
>      rtx new_offset;
>      int offset_arg_no;
> -    rtx incr, total_incr;
>  
>      if (! e.type_suffix (0).integer_p)
>        gcc_unreachable ();
> @@ -412,15 +414,29 @@ public:
>      /* We have to shuffle parameters because the builtin needs additional
>         arguments:
>         - the updated "new_offset"
> -       - total increment (incr * number of lanes)  */
> +       - total increment (incr * number of lanes) in the non-wrapping case
> +       - hack to pass wrap in the top end of a DImode operand so that it is
> +         actually in an odd register  */
>      new_offset = gen_reg_rtx (SImode);
>      e.args.quick_insert (offset_arg_no, new_offset);
>  
> -    incr = e.args[offset_arg_no + 2];
> -    total_incr = gen_int_mode (INTVAL (incr)
> -			       * GET_MODE_NUNITS (e.vector_mode (0)),
> -			       SImode);
> -    e.args.quick_push (total_incr);
> +    if (m_wrap)
> +      {
> +	rtx wrap = gen_reg_rtx (DImode);
> +	emit_insn (gen_rtx_SET (gen_rtx_SUBREG (SImode, wrap, 4),
> +				e.args[offset_arg_no + 2]));
> +	emit_insn (gen_rtx_SET (gen_rtx_SUBREG (SImode, wrap, 0),
> +				GEN_INT (0)));
> +	e.args[offset_arg_no + 2] = wrap;
> +      }
> +    else
> +      {
> +	rtx incr = e.args[offset_arg_no + 2];
> +	rtx total_incr = gen_int_mode (INTVAL (incr)
> +				       * GET_MODE_NUNITS (e.vector_mode (0)),
> +				       SImode);
> +	e.args.quick_push (total_incr);
> +      }
>  
>      /* _wb mode uses the _n builtins and adds code to update the
>         offset.  */
> @@ -428,18 +444,26 @@ public:
>        {
>        case PRED_none:
>  	/* No predicate.  */
> -	code = m_inc_dec
> -	  ? code_for_mve_q_u_insn (VIDUPQ, mode)
> -	  : code_for_mve_q_u_insn (VDDUPQ, mode);
> +	code = m_wrap
> +	  ? (m_inc_dec
> +	     ? code_for_mve_q_wb_u_insn (VIWDUPQ, mode)
> +	     : code_for_mve_q_wb_u_insn (VDWDUPQ, mode))
> +	  : (m_inc_dec
> +	     ? code_for_mve_q_u_insn (VIDUPQ, mode)
> +	     : code_for_mve_q_u_insn (VDDUPQ, mode));
>  	insns = e.use_exact_insn (code);
>  	break;
>  
>        case PRED_m:
>        case PRED_x:
>  	/* "m" or "x" predicate.  */
> -	code = m_inc_dec
> -	  ? code_for_mve_q_m_wb_u_insn (VIDUPQ_M, mode)
> -	  : code_for_mve_q_m_wb_u_insn (VDDUPQ_M, mode);
> +	code = m_wrap
> +	  ? (m_inc_dec
> +	     ? code_for_mve_q_m_wb_u_insn (VIWDUPQ_M, mode)
> +	     : code_for_mve_q_m_wb_u_insn (VDWDUPQ_M, mode))
> +	  : (m_inc_dec
> +	     ? code_for_mve_q_m_wb_u_insn (VIDUPQ_M, mode)
> +	     : code_for_mve_q_m_wb_u_insn (VDDUPQ_M, mode));
>  
>  	if (e.pred == PRED_m)
>  	  insns = e.use_cond_insn (code, 0);
> @@ -671,9 +695,11 @@ FUNCTION_WITHOUT_N_NO_F (vcvtnq, VCVTNQ)
>  FUNCTION_WITHOUT_N_NO_F (vcvtpq, VCVTPQ)
>  FUNCTION (vcvtbq, vcvtxq_impl, (VCVTBQ_F16_F32, VCVTBQ_M_F16_F32, VCVTBQ_F32_F16, VCVTBQ_M_F32_F16))
>  FUNCTION (vcvttq, vcvtxq_impl, (VCVTTQ_F16_F32, VCVTTQ_M_F16_F32, VCVTTQ_F32_F16, VCVTTQ_M_F32_F16))
> -FUNCTION (vddupq, viddup_impl, (false))
> +FUNCTION (vddupq, viddup_impl, (false, false))
>  FUNCTION_ONLY_N (vdupq, VDUPQ)
> -FUNCTION (vidupq, viddup_impl, (true))
> +FUNCTION (vdwdupq, viddup_impl, (false, true))
> +FUNCTION (vidupq, viddup_impl, (true, false))
> +FUNCTION (viwdupq, viddup_impl, (true, true))
>  FUNCTION_WITH_RTX_M (veorq, XOR, VEORQ)
>  FUNCTION (vfmaq, unspec_mve_function_exact_insn, (-1, -1, VFMAQ_F, -1, -1, VFMAQ_N_F, -1, -1, VFMAQ_M_F, -1, -1, VFMAQ_M_N_F))
>  FUNCTION (vfmasq, unspec_mve_function_exact_insn, (-1, -1, -1, -1, -1, VFMASQ_N_F, -1, -1, -1, -1, -1, VFMASQ_M_N_F))
> diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def
> index ed3048e219a..c5f1e8a197b 100644
> --- a/gcc/config/arm/arm-mve-builtins-base.def
> +++ b/gcc/config/arm/arm-mve-builtins-base.def
> @@ -48,12 +48,14 @@ DEF_MVE_FUNCTION (vctp64q, vctp, none, m_or_none)
>  DEF_MVE_FUNCTION (vctp8q, vctp, none, m_or_none)
>  DEF_MVE_FUNCTION (vddupq, viddup, all_unsigned, mx_or_none)
>  DEF_MVE_FUNCTION (vdupq, unary_n, all_integer, mx_or_none)
> +DEF_MVE_FUNCTION (vdwdupq, vidwdup, all_unsigned, mx_or_none)
>  DEF_MVE_FUNCTION (veorq, binary, all_integer, mx_or_none)
>  DEF_MVE_FUNCTION (vhaddq, binary_opt_n, all_integer, mx_or_none)
>  DEF_MVE_FUNCTION (vhcaddq_rot270, binary, all_signed, mx_or_none)
>  DEF_MVE_FUNCTION (vhcaddq_rot90, binary, all_signed, mx_or_none)
>  DEF_MVE_FUNCTION (vhsubq, binary_opt_n, all_integer, mx_or_none)
>  DEF_MVE_FUNCTION (vidupq, viddup, all_unsigned, mx_or_none)
> +DEF_MVE_FUNCTION (viwdupq, vidwdup, all_unsigned, mx_or_none)
>  DEF_MVE_FUNCTION (vld1q, load, all_integer, none)
>  DEF_MVE_FUNCTION (vmaxaq, binary_maxamina, all_signed, m_or_none)
>  DEF_MVE_FUNCTION (vmaxavq, binary_maxavminav, all_signed, p_or_none)
> diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h
> index 526e0f8ee3a..ed8761318bb 100644
> --- a/gcc/config/arm/arm-mve-builtins-base.h
> +++ b/gcc/config/arm/arm-mve-builtins-base.h
> @@ -68,6 +68,7 @@ extern const function_base *const vcvtq;
>  extern const function_base *const vcvttq;
>  extern const function_base *const vddupq;
>  extern const function_base *const vdupq;
> +extern const function_base *const vdwdupq;
>  extern const function_base *const veorq;
>  extern const function_base *const vfmaq;
>  extern const function_base *const vfmasq;
> @@ -77,6 +78,7 @@ extern const function_base *const vhcaddq_rot270;
>  extern const function_base *const vhcaddq_rot90;
>  extern const function_base *const vhsubq;
>  extern const function_base *const vidupq;
> +extern const function_base *const viwdupq;
>  extern const function_base *const vld1q;
>  extern const function_base *const vmaxaq;
>  extern const function_base *const vmaxavq;
> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
> index c3da491b9d1..37b0fedc4ff 100644
> --- a/gcc/config/arm/arm_mve.h
> +++ b/gcc/config/arm/arm_mve.h
> @@ -82,24 +82,10 @@
>  #define vstrwq_scatter_shifted_offset_p(__base, __offset, __value, __p) __arm_vstrwq_scatter_shifted_offset_p(__base, __offset, __value, __p)
>  #define vstrwq_scatter_shifted_offset(__base, __offset, __value) __arm_vstrwq_scatter_shifted_offset(__base, __offset, __value)
>  #define vuninitializedq(__v) __arm_vuninitializedq(__v)
> -#define vdwdupq_m(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m(__inactive, __a, __b, __imm, __p)
> -#define vdwdupq_u8(__a, __b, __imm) __arm_vdwdupq_u8(__a, __b, __imm)
> -#define vdwdupq_u32(__a, __b, __imm) __arm_vdwdupq_u32(__a, __b, __imm)
> -#define vdwdupq_u16(__a, __b, __imm) __arm_vdwdupq_u16(__a, __b, __imm)
> -#define viwdupq_m(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m(__inactive, __a, __b, __imm, __p)
> -#define viwdupq_u8(__a, __b, __imm) __arm_viwdupq_u8(__a, __b, __imm)
> -#define viwdupq_u32(__a, __b, __imm) __arm_viwdupq_u32(__a, __b, __imm)
> -#define viwdupq_u16(__a, __b, __imm) __arm_viwdupq_u16(__a, __b, __imm)
>  #define vstrdq_scatter_base_wb(__addr, __offset, __value) __arm_vstrdq_scatter_base_wb(__addr, __offset, __value)
>  #define vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p)
>  #define vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p)
>  #define vstrwq_scatter_base_wb(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb(__addr, __offset, __value)
> -#define vdwdupq_x_u8(__a, __b, __imm, __p) __arm_vdwdupq_x_u8(__a, __b, __imm, __p)
> -#define vdwdupq_x_u16(__a, __b, __imm, __p) __arm_vdwdupq_x_u16(__a, __b, __imm, __p)
> -#define vdwdupq_x_u32(__a, __b, __imm, __p) __arm_vdwdupq_x_u32(__a, __b, __imm, __p)
> -#define viwdupq_x_u8(__a, __b, __imm, __p) __arm_viwdupq_x_u8(__a, __b, __imm, __p)
> -#define viwdupq_x_u16(__a, __b, __imm, __p) __arm_viwdupq_x_u16(__a, __b, __imm, __p)
> -#define viwdupq_x_u32(__a, __b, __imm, __p) __arm_viwdupq_x_u32(__a, __b, __imm, __p)
>  #define vadciq(__a, __b, __carry_out) __arm_vadciq(__a, __b, __carry_out)
>  #define vadciq_m(__inactive, __a, __b, __carry_out, __p) __arm_vadciq_m(__inactive, __a, __b, __carry_out, __p)
>  #define vadcq(__a, __b, __carry) __arm_vadcq(__a, __b, __carry)
> @@ -323,30 +309,6 @@
>  #define vuninitializedq_s64(void) __arm_vuninitializedq_s64(void)
>  #define vuninitializedq_f16(void) __arm_vuninitializedq_f16(void)
>  #define vuninitializedq_f32(void) __arm_vuninitializedq_f32(void)
> -#define vdwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p) __arm_vdwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p)
> -#define vdwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p) __arm_vdwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p)
> -#define vdwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p) __arm_vdwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p)
> -#define vdwdupq_m_wb_u8(__inactive,  __a, __b,  __imm, __p) __arm_vdwdupq_m_wb_u8(__inactive,  __a, __b,  __imm, __p)
> -#define vdwdupq_m_wb_u32(__inactive,  __a, __b,  __imm, __p) __arm_vdwdupq_m_wb_u32(__inactive,  __a, __b,  __imm, __p)
> -#define vdwdupq_m_wb_u16(__inactive,  __a, __b,  __imm, __p) __arm_vdwdupq_m_wb_u16(__inactive,  __a, __b,  __imm, __p)
> -#define vdwdupq_n_u8(__a, __b,  __imm) __arm_vdwdupq_n_u8(__a, __b,  __imm)
> -#define vdwdupq_n_u32(__a, __b,  __imm) __arm_vdwdupq_n_u32(__a, __b,  __imm)
> -#define vdwdupq_n_u16(__a, __b,  __imm) __arm_vdwdupq_n_u16(__a, __b,  __imm)
> -#define vdwdupq_wb_u8( __a, __b,  __imm) __arm_vdwdupq_wb_u8( __a, __b,  __imm)
> -#define vdwdupq_wb_u32( __a, __b,  __imm) __arm_vdwdupq_wb_u32( __a, __b,  __imm)
> -#define vdwdupq_wb_u16( __a, __b,  __imm) __arm_vdwdupq_wb_u16( __a, __b,  __imm)
> -#define viwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p) __arm_viwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p)
> -#define viwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p) __arm_viwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p)
> -#define viwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p) __arm_viwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p)
> -#define viwdupq_m_wb_u8(__inactive,  __a, __b,  __imm, __p) __arm_viwdupq_m_wb_u8(__inactive,  __a, __b,  __imm, __p)
> -#define viwdupq_m_wb_u32(__inactive,  __a, __b,  __imm, __p) __arm_viwdupq_m_wb_u32(__inactive,  __a, __b,  __imm, __p)
> -#define viwdupq_m_wb_u16(__inactive,  __a, __b,  __imm, __p) __arm_viwdupq_m_wb_u16(__inactive,  __a, __b,  __imm, __p)
> -#define viwdupq_n_u8(__a, __b,  __imm) __arm_viwdupq_n_u8(__a, __b,  __imm)
> -#define viwdupq_n_u32(__a, __b,  __imm) __arm_viwdupq_n_u32(__a, __b,  __imm)
> -#define viwdupq_n_u16(__a, __b,  __imm) __arm_viwdupq_n_u16(__a, __b,  __imm)
> -#define viwdupq_wb_u8( __a, __b,  __imm) __arm_viwdupq_wb_u8( __a, __b,  __imm)
> -#define viwdupq_wb_u32( __a, __b,  __imm) __arm_viwdupq_wb_u32( __a, __b,  __imm)
> -#define viwdupq_wb_u16( __a, __b,  __imm) __arm_viwdupq_wb_u16( __a, __b,  __imm)
>  #define vldrdq_gather_base_wb_s64(__addr, __offset) __arm_vldrdq_gather_base_wb_s64(__addr, __offset)
>  #define vldrdq_gather_base_wb_u64(__addr, __offset) __arm_vldrdq_gather_base_wb_u64(__addr, __offset)
>  #define vldrdq_gather_base_wb_z_s64(__addr, __offset, __p) __arm_vldrdq_gather_base_wb_z_s64(__addr, __offset, __p)
> @@ -367,18 +329,6 @@
>  #define vstrwq_scatter_base_wb_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_s32(__addr, __offset, __value)
>  #define vstrwq_scatter_base_wb_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_u32(__addr, __offset, __value)
>  #define vstrwq_scatter_base_wb_f32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_f32(__addr, __offset, __value)
> -#define vdwdupq_x_n_u8(__a, __b,  __imm, __p) __arm_vdwdupq_x_n_u8(__a, __b,  __imm, __p)
> -#define vdwdupq_x_n_u16(__a, __b,  __imm, __p) __arm_vdwdupq_x_n_u16(__a, __b,  __imm, __p)
> -#define vdwdupq_x_n_u32(__a, __b,  __imm, __p) __arm_vdwdupq_x_n_u32(__a, __b,  __imm, __p)
> -#define vdwdupq_x_wb_u8(__a, __b,  __imm, __p) __arm_vdwdupq_x_wb_u8(__a, __b,  __imm, __p)
> -#define vdwdupq_x_wb_u16(__a, __b,  __imm, __p) __arm_vdwdupq_x_wb_u16(__a, __b,  __imm, __p)
> -#define vdwdupq_x_wb_u32(__a, __b,  __imm, __p) __arm_vdwdupq_x_wb_u32(__a, __b,  __imm, __p)
> -#define viwdupq_x_n_u8(__a, __b,  __imm, __p) __arm_viwdupq_x_n_u8(__a, __b,  __imm, __p)
> -#define viwdupq_x_n_u16(__a, __b,  __imm, __p) __arm_viwdupq_x_n_u16(__a, __b,  __imm, __p)
> -#define viwdupq_x_n_u32(__a, __b,  __imm, __p) __arm_viwdupq_x_n_u32(__a, __b,  __imm, __p)
> -#define viwdupq_x_wb_u8(__a, __b,  __imm, __p) __arm_viwdupq_x_wb_u8(__a, __b,  __imm, __p)
> -#define viwdupq_x_wb_u16(__a, __b,  __imm, __p) __arm_viwdupq_x_wb_u16(__a, __b,  __imm, __p)
> -#define viwdupq_x_wb_u32(__a, __b,  __imm, __p) __arm_viwdupq_x_wb_u32(__a, __b,  __imm, __p)
>  #define vadciq_s32(__a, __b,  __carry_out) __arm_vadciq_s32(__a, __b,  __carry_out)
>  #define vadciq_u32(__a, __b,  __carry_out) __arm_vadciq_u32(__a, __b,  __carry_out)
>  #define vadciq_m_s32(__inactive, __a, __b,  __carry_out, __p) __arm_vadciq_m_s32(__inactive, __a, __b,  __carry_out, __p)
> @@ -1672,223 +1622,6 @@ __arm_vstrwq_scatter_shifted_offset_u32 (uint32_t * __base, uint32x4_t __offset,
>    __builtin_mve_vstrwq_scatter_shifted_offset_uv4si ((__builtin_neon_si *) __base, __offset, __value);
>  }
>  
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_vdwdupq_m_n_uv4si (__inactive, __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint8x16_t __res =  __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, *__a, __c, __imm, __p);
> -  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__inactive, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__inactive, *__a, __c, __imm, __p);
> -  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__inactive, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, *__a, __c, __imm, __p);
> -  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__inactive, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_n_u8 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_vdwdupq_n_uv16qi (__a, __c, __imm);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_n_u32 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_vdwdupq_n_uv4si (__a, __c, __imm);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_n_u16 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_vdwdupq_n_uv8hi (__a, __c, __imm);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_wb_u8 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint8x16_t __res = __builtin_mve_vdwdupq_n_uv16qi (*__a, __c, __imm);
> -  *__a = __builtin_mve_vdwdupq_wb_uv16qi (*__a, __c, __imm);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_wb_u32 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint32x4_t __res = __builtin_mve_vdwdupq_n_uv4si (*__a, __c, __imm);
> -  *__a = __builtin_mve_vdwdupq_wb_uv4si (*__a, __c, __imm);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_wb_u16 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint16x8_t __res = __builtin_mve_vdwdupq_n_uv8hi (*__a, __c, __imm);
> -  *__a = __builtin_mve_vdwdupq_wb_uv8hi (*__a, __c, __imm);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_viwdupq_m_n_uv16qi (__inactive, __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_viwdupq_m_n_uv4si (__inactive, __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_viwdupq_m_n_uv8hi (__inactive, __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__inactive, *__a, __c, __imm, __p);
> -  *__a =  __builtin_mve_viwdupq_m_wb_uv16qi (__inactive, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__inactive, *__a, __c, __imm, __p);
> -  *__a =  __builtin_mve_viwdupq_m_wb_uv4si (__inactive, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__inactive, *__a, __c, __imm, __p);
> -  *__a =  __builtin_mve_viwdupq_m_wb_uv8hi (__inactive, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_n_u8 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_viwdupq_n_uv16qi (__a, __c, __imm);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_n_u32 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_viwdupq_n_uv4si (__a, __c, __imm);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_n_u16 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_viwdupq_n_uv8hi (__a, __c, __imm);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_wb_u8 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint8x16_t __res = __builtin_mve_viwdupq_n_uv16qi (*__a, __c, __imm);
> -  *__a = __builtin_mve_viwdupq_wb_uv16qi (*__a, __c, __imm);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_wb_u32 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint32x4_t __res = __builtin_mve_viwdupq_n_uv4si (*__a, __c, __imm);
> -  *__a = __builtin_mve_viwdupq_wb_uv4si (*__a, __c, __imm);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_wb_u16 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint16x8_t __res = __builtin_mve_viwdupq_n_uv8hi (*__a, __c, __imm);
> -  *__a = __builtin_mve_viwdupq_wb_uv8hi (*__a, __c, __imm);
> -  return __res;
> -}
> -
> -
>  __extension__ extern __inline int64x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vldrdq_gather_base_wb_s64 (uint64x2_t * __addr, const int __offset)
> @@ -2025,120 +1758,6 @@ __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint3
>    *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value);
>  }
>  
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_vdwdupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_n_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_vdwdupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_n_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_vdwdupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_wb_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint8x16_t __arg1 = __arm_vuninitializedq_u8 ();
> -  uint8x16_t __res = __builtin_mve_vdwdupq_m_n_uv16qi (__arg1, *__a, __c, __imm, __p);
> -  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__arg1, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_wb_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint16x8_t __arg1 = __arm_vuninitializedq_u16 ();
> -  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__arg1, *__a, __c, __imm, __p);
> -  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__arg1, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_wb_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint32x4_t __arg1 = __arm_vuninitializedq_u32 ();
> -  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__arg1, *__a, __c, __imm, __p);
> -  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__arg1, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_viwdupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_n_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_viwdupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_n_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  return __builtin_mve_viwdupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __c, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_wb_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint8x16_t __arg1 = __arm_vuninitializedq_u8 ();
> -  uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__arg1, *__a, __c, __imm, __p);
> -  *__a =  __builtin_mve_viwdupq_m_wb_uv16qi (__arg1, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_wb_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint16x8_t __arg1 = __arm_vuninitializedq_u16 ();
> -  uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__arg1, *__a, __c, __imm, __p);
> -  *__a =  __builtin_mve_viwdupq_m_wb_uv8hi (__arg1, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_wb_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> -  uint64_t __c = ((uint64_t) __b) << 32;
> -  uint32x4_t __arg1 = __arm_vuninitializedq_u32 ();
> -  uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__arg1, *__a, __c, __imm, __p);
> -  *__a =  __builtin_mve_viwdupq_m_wb_uv4si (__arg1, *__a, __c, __imm, __p);
> -  return __res;
> -}
> -
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vadciq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry_out)
> @@ -4131,174 +3750,6 @@ __arm_vstrwq_scatter_shifted_offset (uint32_t * __base, uint32x4_t __offset, uin
>   __arm_vstrwq_scatter_shifted_offset_u32 (__base, __offset, __value);
>  }
>  
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_m_n_u8 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_m_n_u32 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_m_n_u16 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_m_wb_u8 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_m_wb_u32 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_m (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_m_wb_u16 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_u8 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> - return __arm_vdwdupq_n_u8 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_u32 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> - return __arm_vdwdupq_n_u32 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_u16 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> - return __arm_vdwdupq_n_u16 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_u8 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> - return __arm_vdwdupq_wb_u8 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_u32 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> - return __arm_vdwdupq_wb_u32 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_u16 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> - return __arm_vdwdupq_wb_u16 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_m_n_u8 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_m_n_u32 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_m_n_u16 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_m_wb_u8 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_m_wb_u32 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_m (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_m_wb_u16 (__inactive, __a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_u8 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> - return __arm_viwdupq_n_u8 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_u32 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> - return __arm_viwdupq_n_u32 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_u16 (uint32_t __a, uint32_t __b, const int __imm)
> -{
> - return __arm_viwdupq_n_u16 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_u8 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> - return __arm_viwdupq_wb_u8 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_u32 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> - return __arm_viwdupq_wb_u32 (__a, __b, __imm);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_u16 (uint32_t * __a, uint32_t __b, const int __imm)
> -{
> - return __arm_viwdupq_wb_u16 (__a, __b, __imm);
> -}
> -
>  __extension__ extern __inline void
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vstrdq_scatter_base_wb (uint64x2_t * __addr, const int __offset, int64x2_t __value)
> @@ -4355,90 +3806,6 @@ __arm_vstrwq_scatter_base_wb (uint32x4_t * __addr, const int __offset, uint32x4_
>   __arm_vstrwq_scatter_base_wb_u32 (__addr, __offset, __value);
>  }
>  
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_x_n_u8 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_x_n_u16 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_x_n_u32 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_x_wb_u8 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_x_wb_u16 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_vdwdupq_x_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_vdwdupq_x_wb_u32 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_x_n_u8 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_x_n_u16 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_x_n_u32 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint8x16_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_x_wb_u8 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint16x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_x_wb_u16 (__a, __b, __imm, __p);
> -}
> -
> -__extension__ extern __inline uint32x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__arm_viwdupq_x_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
> -{
> - return __arm_viwdupq_x_wb_u32 (__a, __b, __imm, __p);
> -}
> -
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  __arm_vadciq (int32x4_t __a, int32x4_t __b, unsigned * __carry_out)
> @@ -6146,37 +5513,6 @@ extern void *__ARM_undef;
>  #endif /* MVE Integer.  */
>  
>  
> -
> -#define __arm_vdwdupq_x_u8(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
> -  int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_x_n_u8 ((uint32_t) __p1, p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
> -
> -#define __arm_vdwdupq_x_u16(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
> -  int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_x_n_u16 ((uint32_t) __p1, p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
> -
> -#define __arm_vdwdupq_x_u32(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
> -  int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_x_n_u32 ((uint32_t) __p1, p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
> -
> -#define __arm_viwdupq_x_u8(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
> -  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u8 ((uint32_t) __p1, p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
> -
> -#define __arm_viwdupq_x_u16(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
> -  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u16 ((uint32_t) __p1, p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
> -
> -#define __arm_viwdupq_x_u32(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
> -  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u32 ((uint32_t) __p1, p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
> -
>  #define __arm_vadciq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
>    __typeof(p1) __p1 = (p1); \
>    _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
> @@ -6279,56 +5615,6 @@ extern void *__ARM_undef;
>    int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16(__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
>    int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32(__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));})
>  
> -#define __arm_viwdupq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \
> -  __typeof(p1) __p1 = (p1); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
> -  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_viwdupq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_viwdupq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_viwdupq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_m_wb_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_m_wb_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \
> -  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_m_wb_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
> -
> -#define __arm_viwdupq_u16(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
> -  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
> -  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u16 (__ARM_mve_coerce_i_scalar(__p0, int), p1, (const int) p2), \
> -  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u16 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, (const int) p2));})
> -

Patch

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc
index 3d8bcdabe24..eaf054d9823 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -354,16 +354,19 @@  public:
    vector mode associated with type suffix 0.  We need this special case
   because in MODE_wb the builtins dereference the first parameter and update
    its contents.  We also have to insert the two additional parameters needed
-   by the builtins compared to the intrinsics.  */
+   by the builtins compared to the intrinsics.  In wrapping mode, we must
+   apply the 'hack' making sure the 'wrap' parameter is in an odd register.  */
 class viddup_impl : public function_base
 {
 public:
-  CONSTEXPR viddup_impl (bool inc_dec)
-    : m_inc_dec (inc_dec)
+  CONSTEXPR viddup_impl (bool inc_dec, bool wrap)
+    : m_inc_dec (inc_dec), m_wrap (wrap)
   {}
 
   /* Increment (true) or decrement (false).  */
   bool m_inc_dec;
+  /* v[id]wdup (true) or v[id]dup (false).  */
+  bool m_wrap;
 
   unsigned int
   call_properties (const function_instance &fi) const override
@@ -388,7 +391,6 @@  public:
     rtx insns, offset_ptr;
     rtx new_offset;
     int offset_arg_no;
-    rtx incr, total_incr;
 
     if (! e.type_suffix (0).integer_p)
       gcc_unreachable ();
@@ -412,15 +414,29 @@  public:
     /* We have to shuffle parameters because the builtin needs additional
        arguments:
        - the updated "new_offset"
-       - total increment (incr * number of lanes)  */
+       - total increment (incr * number of lanes) in the non-wrapping case
+       - hack to pass wrap in the top half of a DImode operand so that it
+         actually ends up in an odd register  */
     new_offset = gen_reg_rtx (SImode);
     e.args.quick_insert (offset_arg_no, new_offset);
 
-    incr = e.args[offset_arg_no + 2];
-    total_incr = gen_int_mode (INTVAL (incr)
-			       * GET_MODE_NUNITS (e.vector_mode (0)),
-			       SImode);
-    e.args.quick_push (total_incr);
+    if (m_wrap)
+      {
+	rtx wrap = gen_reg_rtx (DImode);
+	emit_insn (gen_rtx_SET (gen_rtx_SUBREG (SImode, wrap, 4),
+				e.args[offset_arg_no + 2]));
+	emit_insn (gen_rtx_SET (gen_rtx_SUBREG (SImode, wrap, 0),
+				GEN_INT (0)));
+	e.args[offset_arg_no + 2] = wrap;
+      }
+    else
+      {
+	rtx incr = e.args[offset_arg_no + 2];
+	rtx total_incr = gen_int_mode (INTVAL (incr)
+				       * GET_MODE_NUNITS (e.vector_mode (0)),
+				       SImode);
+	e.args.quick_push (total_incr);
+      }
 
     /* _wb mode uses the _n builtins and adds code to update the
        offset.  */
@@ -428,18 +444,26 @@  public:
       {
       case PRED_none:
 	/* No predicate.  */
-	code = m_inc_dec
-	  ? code_for_mve_q_u_insn (VIDUPQ, mode)
-	  : code_for_mve_q_u_insn (VDDUPQ, mode);
+	code = m_wrap
+	  ? (m_inc_dec
+	     ? code_for_mve_q_wb_u_insn (VIWDUPQ, mode)
+	     : code_for_mve_q_wb_u_insn (VDWDUPQ, mode))
+	  : (m_inc_dec
+	     ? code_for_mve_q_u_insn (VIDUPQ, mode)
+	     : code_for_mve_q_u_insn (VDDUPQ, mode));
 	insns = e.use_exact_insn (code);
 	break;
 
       case PRED_m:
       case PRED_x:
 	/* "m" or "x" predicate.  */
-	code = m_inc_dec
-	  ? code_for_mve_q_m_wb_u_insn (VIDUPQ_M, mode)
-	  : code_for_mve_q_m_wb_u_insn (VDDUPQ_M, mode);
+	code = m_wrap
+	  ? (m_inc_dec
+	     ? code_for_mve_q_m_wb_u_insn (VIWDUPQ_M, mode)
+	     : code_for_mve_q_m_wb_u_insn (VDWDUPQ_M, mode))
+	  : (m_inc_dec
+	     ? code_for_mve_q_m_wb_u_insn (VIDUPQ_M, mode)
+	     : code_for_mve_q_m_wb_u_insn (VDDUPQ_M, mode));
 
 	if (e.pred == PRED_m)
 	  insns = e.use_cond_insn (code, 0);
@@ -671,9 +695,11 @@  FUNCTION_WITHOUT_N_NO_F (vcvtnq, VCVTNQ)
 FUNCTION_WITHOUT_N_NO_F (vcvtpq, VCVTPQ)
 FUNCTION (vcvtbq, vcvtxq_impl, (VCVTBQ_F16_F32, VCVTBQ_M_F16_F32, VCVTBQ_F32_F16, VCVTBQ_M_F32_F16))
 FUNCTION (vcvttq, vcvtxq_impl, (VCVTTQ_F16_F32, VCVTTQ_M_F16_F32, VCVTTQ_F32_F16, VCVTTQ_M_F32_F16))
-FUNCTION (vddupq, viddup_impl, (false))
+FUNCTION (vddupq, viddup_impl, (false, false))
 FUNCTION_ONLY_N (vdupq, VDUPQ)
-FUNCTION (vidupq, viddup_impl, (true))
+FUNCTION (vdwdupq, viddup_impl, (false, true))
+FUNCTION (vidupq, viddup_impl, (true, false))
+FUNCTION (viwdupq, viddup_impl, (true, true))
 FUNCTION_WITH_RTX_M (veorq, XOR, VEORQ)
 FUNCTION (vfmaq, unspec_mve_function_exact_insn, (-1, -1, VFMAQ_F, -1, -1, VFMAQ_N_F, -1, -1, VFMAQ_M_F, -1, -1, VFMAQ_M_N_F))
 FUNCTION (vfmasq, unspec_mve_function_exact_insn, (-1, -1, -1, -1, -1, VFMASQ_N_F, -1, -1, -1, -1, -1, VFMASQ_M_N_F))
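
For illustration only (not part of the patch): a minimal usage sketch of the
reworked intrinsics, assuming an MVE-enabled target and <arm_mve.h>.  The
generic names dispatch on the first argument: a scalar start value selects
the _n form, while a uint32_t pointer selects the _wb form, which also
writes the updated offset back through the pointer.

#include <arm_mve.h>

uint16x8_t
dup_sketch (uint32_t start, uint32_t wrap, uint32_t *offset)
{
  /* Lanes count down from 'start' in steps of 4, wrapping at 'wrap';
     the immediate must be 1, 2, 4 or 8 (mve_imm_selective_upto_8).  */
  uint16x8_t down = vdwdupq_u16 (start, wrap, 4);
  /* Lanes count up from '*offset'; the next offset is stored back.  */
  uint16x8_t up = viwdupq_u16 (offset, wrap, 4);
  return vaddq (down, up);
}
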
diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def
index ed3048e219a..c5f1e8a197b 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -48,12 +48,14 @@  DEF_MVE_FUNCTION (vctp64q, vctp, none, m_or_none)
 DEF_MVE_FUNCTION (vctp8q, vctp, none, m_or_none)
 DEF_MVE_FUNCTION (vddupq, viddup, all_unsigned, mx_or_none)
 DEF_MVE_FUNCTION (vdupq, unary_n, all_integer, mx_or_none)
+DEF_MVE_FUNCTION (vdwdupq, vidwdup, all_unsigned, mx_or_none)
 DEF_MVE_FUNCTION (veorq, binary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vhaddq, binary_opt_n, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vhcaddq_rot270, binary, all_signed, mx_or_none)
 DEF_MVE_FUNCTION (vhcaddq_rot90, binary, all_signed, mx_or_none)
 DEF_MVE_FUNCTION (vhsubq, binary_opt_n, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vidupq, viddup, all_unsigned, mx_or_none)
+DEF_MVE_FUNCTION (viwdupq, vidwdup, all_unsigned, mx_or_none)
 DEF_MVE_FUNCTION (vld1q, load, all_integer, none)
 DEF_MVE_FUNCTION (vmaxaq, binary_maxamina, all_signed, m_or_none)
 DEF_MVE_FUNCTION (vmaxavq, binary_maxavminav, all_signed, p_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h
index 526e0f8ee3a..ed8761318bb 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -68,6 +68,7 @@  extern const function_base *const vcvtq;
 extern const function_base *const vcvttq;
 extern const function_base *const vddupq;
 extern const function_base *const vdupq;
+extern const function_base *const vdwdupq;
 extern const function_base *const veorq;
 extern const function_base *const vfmaq;
 extern const function_base *const vfmasq;
@@ -77,6 +78,7 @@  extern const function_base *const vhcaddq_rot270;
 extern const function_base *const vhcaddq_rot90;
 extern const function_base *const vhsubq;
 extern const function_base *const vidupq;
+extern const function_base *const viwdupq;
 extern const function_base *const vld1q;
 extern const function_base *const vmaxaq;
 extern const function_base *const vmaxavq;
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index c3da491b9d1..37b0fedc4ff 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -82,24 +82,10 @@ 
 #define vstrwq_scatter_shifted_offset_p(__base, __offset, __value, __p) __arm_vstrwq_scatter_shifted_offset_p(__base, __offset, __value, __p)
 #define vstrwq_scatter_shifted_offset(__base, __offset, __value) __arm_vstrwq_scatter_shifted_offset(__base, __offset, __value)
 #define vuninitializedq(__v) __arm_vuninitializedq(__v)
-#define vdwdupq_m(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m(__inactive, __a, __b, __imm, __p)
-#define vdwdupq_u8(__a, __b, __imm) __arm_vdwdupq_u8(__a, __b, __imm)
-#define vdwdupq_u32(__a, __b, __imm) __arm_vdwdupq_u32(__a, __b, __imm)
-#define vdwdupq_u16(__a, __b, __imm) __arm_vdwdupq_u16(__a, __b, __imm)
-#define viwdupq_m(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m(__inactive, __a, __b, __imm, __p)
-#define viwdupq_u8(__a, __b, __imm) __arm_viwdupq_u8(__a, __b, __imm)
-#define viwdupq_u32(__a, __b, __imm) __arm_viwdupq_u32(__a, __b, __imm)
-#define viwdupq_u16(__a, __b, __imm) __arm_viwdupq_u16(__a, __b, __imm)
 #define vstrdq_scatter_base_wb(__addr, __offset, __value) __arm_vstrdq_scatter_base_wb(__addr, __offset, __value)
 #define vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p)
 #define vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p)
 #define vstrwq_scatter_base_wb(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb(__addr, __offset, __value)
-#define vdwdupq_x_u8(__a, __b, __imm, __p) __arm_vdwdupq_x_u8(__a, __b, __imm, __p)
-#define vdwdupq_x_u16(__a, __b, __imm, __p) __arm_vdwdupq_x_u16(__a, __b, __imm, __p)
-#define vdwdupq_x_u32(__a, __b, __imm, __p) __arm_vdwdupq_x_u32(__a, __b, __imm, __p)
-#define viwdupq_x_u8(__a, __b, __imm, __p) __arm_viwdupq_x_u8(__a, __b, __imm, __p)
-#define viwdupq_x_u16(__a, __b, __imm, __p) __arm_viwdupq_x_u16(__a, __b, __imm, __p)
-#define viwdupq_x_u32(__a, __b, __imm, __p) __arm_viwdupq_x_u32(__a, __b, __imm, __p)
 #define vadciq(__a, __b, __carry_out) __arm_vadciq(__a, __b, __carry_out)
 #define vadciq_m(__inactive, __a, __b, __carry_out, __p) __arm_vadciq_m(__inactive, __a, __b, __carry_out, __p)
 #define vadcq(__a, __b, __carry) __arm_vadcq(__a, __b, __carry)
@@ -323,30 +309,6 @@ 
 #define vuninitializedq_s64(void) __arm_vuninitializedq_s64(void)
 #define vuninitializedq_f16(void) __arm_vuninitializedq_f16(void)
 #define vuninitializedq_f32(void) __arm_vuninitializedq_f32(void)
-#define vdwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p) __arm_vdwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p)
-#define vdwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p) __arm_vdwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p)
-#define vdwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p) __arm_vdwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p)
-#define vdwdupq_m_wb_u8(__inactive,  __a, __b,  __imm, __p) __arm_vdwdupq_m_wb_u8(__inactive,  __a, __b,  __imm, __p)
-#define vdwdupq_m_wb_u32(__inactive,  __a, __b,  __imm, __p) __arm_vdwdupq_m_wb_u32(__inactive,  __a, __b,  __imm, __p)
-#define vdwdupq_m_wb_u16(__inactive,  __a, __b,  __imm, __p) __arm_vdwdupq_m_wb_u16(__inactive,  __a, __b,  __imm, __p)
-#define vdwdupq_n_u8(__a, __b,  __imm) __arm_vdwdupq_n_u8(__a, __b,  __imm)
-#define vdwdupq_n_u32(__a, __b,  __imm) __arm_vdwdupq_n_u32(__a, __b,  __imm)
-#define vdwdupq_n_u16(__a, __b,  __imm) __arm_vdwdupq_n_u16(__a, __b,  __imm)
-#define vdwdupq_wb_u8( __a, __b,  __imm) __arm_vdwdupq_wb_u8( __a, __b,  __imm)
-#define vdwdupq_wb_u32( __a, __b,  __imm) __arm_vdwdupq_wb_u32( __a, __b,  __imm)
-#define vdwdupq_wb_u16( __a, __b,  __imm) __arm_vdwdupq_wb_u16( __a, __b,  __imm)
-#define viwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p) __arm_viwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p)
-#define viwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p) __arm_viwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p)
-#define viwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p) __arm_viwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p)
-#define viwdupq_m_wb_u8(__inactive,  __a, __b,  __imm, __p) __arm_viwdupq_m_wb_u8(__inactive,  __a, __b,  __imm, __p)
-#define viwdupq_m_wb_u32(__inactive,  __a, __b,  __imm, __p) __arm_viwdupq_m_wb_u32(__inactive,  __a, __b,  __imm, __p)
-#define viwdupq_m_wb_u16(__inactive,  __a, __b,  __imm, __p) __arm_viwdupq_m_wb_u16(__inactive,  __a, __b,  __imm, __p)
-#define viwdupq_n_u8(__a, __b,  __imm) __arm_viwdupq_n_u8(__a, __b,  __imm)
-#define viwdupq_n_u32(__a, __b,  __imm) __arm_viwdupq_n_u32(__a, __b,  __imm)
-#define viwdupq_n_u16(__a, __b,  __imm) __arm_viwdupq_n_u16(__a, __b,  __imm)
-#define viwdupq_wb_u8( __a, __b,  __imm) __arm_viwdupq_wb_u8( __a, __b,  __imm)
-#define viwdupq_wb_u32( __a, __b,  __imm) __arm_viwdupq_wb_u32( __a, __b,  __imm)
-#define viwdupq_wb_u16( __a, __b,  __imm) __arm_viwdupq_wb_u16( __a, __b,  __imm)
 #define vldrdq_gather_base_wb_s64(__addr, __offset) __arm_vldrdq_gather_base_wb_s64(__addr, __offset)
 #define vldrdq_gather_base_wb_u64(__addr, __offset) __arm_vldrdq_gather_base_wb_u64(__addr, __offset)
 #define vldrdq_gather_base_wb_z_s64(__addr, __offset, __p) __arm_vldrdq_gather_base_wb_z_s64(__addr, __offset, __p)
@@ -367,18 +329,6 @@ 
 #define vstrwq_scatter_base_wb_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_s32(__addr, __offset, __value)
 #define vstrwq_scatter_base_wb_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_u32(__addr, __offset, __value)
 #define vstrwq_scatter_base_wb_f32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_f32(__addr, __offset, __value)
-#define vdwdupq_x_n_u8(__a, __b,  __imm, __p) __arm_vdwdupq_x_n_u8(__a, __b,  __imm, __p)
-#define vdwdupq_x_n_u16(__a, __b,  __imm, __p) __arm_vdwdupq_x_n_u16(__a, __b,  __imm, __p)
-#define vdwdupq_x_n_u32(__a, __b,  __imm, __p) __arm_vdwdupq_x_n_u32(__a, __b,  __imm, __p)
-#define vdwdupq_x_wb_u8(__a, __b,  __imm, __p) __arm_vdwdupq_x_wb_u8(__a, __b,  __imm, __p)
-#define vdwdupq_x_wb_u16(__a, __b,  __imm, __p) __arm_vdwdupq_x_wb_u16(__a, __b,  __imm, __p)
-#define vdwdupq_x_wb_u32(__a, __b,  __imm, __p) __arm_vdwdupq_x_wb_u32(__a, __b,  __imm, __p)
-#define viwdupq_x_n_u8(__a, __b,  __imm, __p) __arm_viwdupq_x_n_u8(__a, __b,  __imm, __p)
-#define viwdupq_x_n_u16(__a, __b,  __imm, __p) __arm_viwdupq_x_n_u16(__a, __b,  __imm, __p)
-#define viwdupq_x_n_u32(__a, __b,  __imm, __p) __arm_viwdupq_x_n_u32(__a, __b,  __imm, __p)
-#define viwdupq_x_wb_u8(__a, __b,  __imm, __p) __arm_viwdupq_x_wb_u8(__a, __b,  __imm, __p)
-#define viwdupq_x_wb_u16(__a, __b,  __imm, __p) __arm_viwdupq_x_wb_u16(__a, __b,  __imm, __p)
-#define viwdupq_x_wb_u32(__a, __b,  __imm, __p) __arm_viwdupq_x_wb_u32(__a, __b,  __imm, __p)
 #define vadciq_s32(__a, __b,  __carry_out) __arm_vadciq_s32(__a, __b,  __carry_out)
 #define vadciq_u32(__a, __b,  __carry_out) __arm_vadciq_u32(__a, __b,  __carry_out)
 #define vadciq_m_s32(__inactive, __a, __b,  __carry_out, __p) __arm_vadciq_m_s32(__inactive, __a, __b,  __carry_out, __p)
@@ -1672,223 +1622,6 @@  __arm_vstrwq_scatter_shifted_offset_u32 (uint32_t * __base, uint32x4_t __offset,
   __builtin_mve_vstrwq_scatter_shifted_offset_uv4si ((__builtin_neon_si *) __base, __offset, __value);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_vdwdupq_m_n_uv4si (__inactive, __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint8x16_t __res =  __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, *__a, __c, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__inactive, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__inactive, *__a, __c, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__inactive, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, *__a, __c, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__inactive, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_n_u8 (uint32_t __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_vdwdupq_n_uv16qi (__a, __c, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_n_u32 (uint32_t __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_vdwdupq_n_uv4si (__a, __c, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_n_u16 (uint32_t __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_vdwdupq_n_uv8hi (__a, __c, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_wb_u8 (uint32_t * __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint8x16_t __res = __builtin_mve_vdwdupq_n_uv16qi (*__a, __c, __imm);
-  *__a = __builtin_mve_vdwdupq_wb_uv16qi (*__a, __c, __imm);
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_wb_u32 (uint32_t * __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint32x4_t __res = __builtin_mve_vdwdupq_n_uv4si (*__a, __c, __imm);
-  *__a = __builtin_mve_vdwdupq_wb_uv4si (*__a, __c, __imm);
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_wb_u16 (uint32_t * __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint16x8_t __res = __builtin_mve_vdwdupq_n_uv8hi (*__a, __c, __imm);
-  *__a = __builtin_mve_vdwdupq_wb_uv8hi (*__a, __c, __imm);
-  return __res;
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_viwdupq_m_n_uv16qi (__inactive, __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_viwdupq_m_n_uv4si (__inactive, __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_viwdupq_m_n_uv8hi (__inactive, __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__inactive, *__a, __c, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv16qi (__inactive, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__inactive, *__a, __c, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv4si (__inactive, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__inactive, *__a, __c, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv8hi (__inactive, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_n_u8 (uint32_t __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_viwdupq_n_uv16qi (__a, __c, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_n_u32 (uint32_t __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_viwdupq_n_uv4si (__a, __c, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_n_u16 (uint32_t __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_viwdupq_n_uv8hi (__a, __c, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_wb_u8 (uint32_t * __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint8x16_t __res = __builtin_mve_viwdupq_n_uv16qi (*__a, __c, __imm);
-  *__a = __builtin_mve_viwdupq_wb_uv16qi (*__a, __c, __imm);
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_wb_u32 (uint32_t * __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint32x4_t __res = __builtin_mve_viwdupq_n_uv4si (*__a, __c, __imm);
-  *__a = __builtin_mve_viwdupq_wb_uv4si (*__a, __c, __imm);
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_wb_u16 (uint32_t * __a, uint32_t __b, const int __imm)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint16x8_t __res = __builtin_mve_viwdupq_n_uv8hi (*__a, __c, __imm);
-  *__a = __builtin_mve_viwdupq_wb_uv8hi (*__a, __c, __imm);
-  return __res;
-}
-
-
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vldrdq_gather_base_wb_s64 (uint64x2_t * __addr, const int __offset)
@@ -2025,120 +1758,6 @@  __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint3
   *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_vdwdupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_n_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_vdwdupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_n_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_vdwdupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_wb_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint8x16_t __arg1 = __arm_vuninitializedq_u8 ();
-  uint8x16_t __res = __builtin_mve_vdwdupq_m_n_uv16qi (__arg1, *__a, __c, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__arg1, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_wb_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint16x8_t __arg1 = __arm_vuninitializedq_u16 ();
-  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__arg1, *__a, __c, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__arg1, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_wb_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint32x4_t __arg1 = __arm_vuninitializedq_u32 ();
-  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__arg1, *__a, __c, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__arg1, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_viwdupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_n_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_viwdupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_n_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  return __builtin_mve_viwdupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __c, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_wb_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint8x16_t __arg1 = __arm_vuninitializedq_u8 ();
-  uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__arg1, *__a, __c, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv16qi (__arg1, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_wb_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint16x8_t __arg1 = __arm_vuninitializedq_u16 ();
-  uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__arg1, *__a, __c, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv8hi (__arg1, *__a, __c, __imm, __p);
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_wb_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
-  uint64_t __c = ((uint64_t) __b) << 32;
-  uint32x4_t __arg1 = __arm_vuninitializedq_u32 ();
-  uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__arg1, *__a, __c, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv4si (__arg1, *__a, __c, __imm, __p);
-  return __res;
-}
-
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vadciq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry_out)
@@ -4131,174 +3750,6 @@  __arm_vstrwq_scatter_shifted_offset (uint32_t * __base, uint32x4_t __offset, uin
  __arm_vstrwq_scatter_shifted_offset_u32 (__base, __offset, __value);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_m_n_u8 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_m_n_u32 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_m_n_u16 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_m_wb_u8 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_m_wb_u32 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_m (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_m_wb_u16 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_u8 (uint32_t __a, uint32_t __b, const int __imm)
-{
- return __arm_vdwdupq_n_u8 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_u32 (uint32_t __a, uint32_t __b, const int __imm)
-{
- return __arm_vdwdupq_n_u32 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_u16 (uint32_t __a, uint32_t __b, const int __imm)
-{
- return __arm_vdwdupq_n_u16 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_u8 (uint32_t * __a, uint32_t __b, const int __imm)
-{
- return __arm_vdwdupq_wb_u8 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_u32 (uint32_t * __a, uint32_t __b, const int __imm)
-{
- return __arm_vdwdupq_wb_u32 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_u16 (uint32_t * __a, uint32_t __b, const int __imm)
-{
- return __arm_vdwdupq_wb_u16 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_m_n_u8 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_m_n_u32 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_m_n_u16 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_m_wb_u8 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_m_wb_u32 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_m (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_m_wb_u16 (__inactive, __a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_u8 (uint32_t __a, uint32_t __b, const int __imm)
-{
- return __arm_viwdupq_n_u8 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_u32 (uint32_t __a, uint32_t __b, const int __imm)
-{
- return __arm_viwdupq_n_u32 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_u16 (uint32_t __a, uint32_t __b, const int __imm)
-{
- return __arm_viwdupq_n_u16 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_u8 (uint32_t * __a, uint32_t __b, const int __imm)
-{
- return __arm_viwdupq_wb_u8 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_u32 (uint32_t * __a, uint32_t __b, const int __imm)
-{
- return __arm_viwdupq_wb_u32 (__a, __b, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_u16 (uint32_t * __a, uint32_t __b, const int __imm)
-{
- return __arm_viwdupq_wb_u16 (__a, __b, __imm);
-}
-
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrdq_scatter_base_wb (uint64x2_t * __addr, const int __offset, int64x2_t __value)
@@ -4355,90 +3806,6 @@  __arm_vstrwq_scatter_base_wb (uint32x4_t * __addr, const int __offset, uint32x4_
  __arm_vstrwq_scatter_base_wb_u32 (__addr, __offset, __value);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_x_n_u8 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_x_n_u16 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_x_n_u32 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_x_wb_u8 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_x_wb_u16 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vdwdupq_x_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_vdwdupq_x_wb_u32 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_x_n_u8 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_x_n_u16 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_x_n_u32 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_x_wb_u8 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_x_wb_u16 (__a, __b, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_viwdupq_x_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
-{
- return __arm_viwdupq_x_wb_u32 (__a, __b, __imm, __p);
-}
-
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vadciq (int32x4_t __a, int32x4_t __b, unsigned * __carry_out)
@@ -6146,37 +5513,6 @@  extern void *__ARM_undef;
 #endif /* MVE Integer.  */
 
 
-
-#define __arm_vdwdupq_x_u8(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_x_n_u8 ((uint32_t) __p1, p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
-
-#define __arm_vdwdupq_x_u16(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_x_n_u16 ((uint32_t) __p1, p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
-
-#define __arm_vdwdupq_x_u32(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_x_n_u32 ((uint32_t) __p1, p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
-
-#define __arm_viwdupq_x_u8(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u8 ((uint32_t) __p1, p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
-
-#define __arm_viwdupq_x_u16(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u16 ((uint32_t) __p1, p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
-
-#define __arm_viwdupq_x_u32(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u32 ((uint32_t) __p1, p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
-
 #define __arm_vadciq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
   __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
@@ -6279,56 +5615,6 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16(__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
   int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32(__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));})
 
-#define __arm_viwdupq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_viwdupq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_viwdupq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_viwdupq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_m_wb_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_m_wb_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_m_wb_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
-
-#define __arm_viwdupq_u16(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u16 (__ARM_mve_coerce_i_scalar(__p0, int), p1, (const int) p2), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u16 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, (const int) p2));})
-
-#define __arm_viwdupq_u32(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u32 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));})
-
-#define __arm_viwdupq_u8(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u8 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u8 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));})
-
-#define __arm_vdwdupq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \
-  __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vdwdupq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vdwdupq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vdwdupq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_m_wb_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_m_wb_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \
-  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_m_wb_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
-
-#define __arm_vdwdupq_u16(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_n_u16 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_wb_u16 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));})
-
-#define __arm_vdwdupq_u32(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_n_u32 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_wb_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));})
-
-#define __arm_vdwdupq_u8(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_n_u8 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_wb_u8 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));})
-
 #define __arm_vshlcq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
   int (*)[__ARM_mve_type_int8x16_t]: __arm_vshlcq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1, p2, p3), \
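
As a side note, the wrappers deleted above resolved the scalar-versus-pointer
overload at compile time with C11 _Generic.  A simplified, self-contained
analogue of that dispatch pattern (dup, dup_n and dup_wb are hypothetical
stand-ins, not MVE names):

#include <stdint.h>

static inline uint32_t dup_n  (uint32_t  start) { return start; }
static inline uint32_t dup_wb (uint32_t *start) { return (*start)++; }

/* A uint32_t pointer selects the writeback form, anything else the
   scalar form -- the same choice __arm_vdwdupq_u8 made between
   __arm_vdwdupq_wb_u8 and __arm_vdwdupq_n_u8.  */
#define dup(p0) _Generic ((p0),	\
  uint32_t *: dup_wb,		\
  default:    dup_n) (p0)
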
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index be3be67a144..72a7e4dc868 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -5219,14 +5219,14 @@  (define_expand "mve_vdwdupq_m_wb_u<mode>"
 ;;
 (define_insn "@mve_<mve_insn>q_m_wb_u<mode>_insn"
   [(set (match_operand:MVE_2 0 "s_register_operand" "=w")
-	(unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "0")
-		       (match_operand:SI 3 "s_register_operand" "1")
+	(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
+		       (match_operand:SI 3 "s_register_operand" "2")
 		       (subreg:SI (match_operand:DI 4 "s_register_operand" "r") 4)
 		       (match_operand:SI 5 "mve_imm_selective_upto_8" "Rg")
 		       (match_operand:<MVE_VPRED> 6 "vpr_register_operand" "Up")]
 	 VIDWDUPQ_M))
-   (set (match_operand:SI 1 "s_register_operand" "=Te")
-	(unspec:SI [(match_dup 2)
+   (set (match_operand:SI 2 "s_register_operand" "=Te")
+	(unspec:SI [(match_dup 1)
 		    (match_dup 3)
 		    (subreg:SI (match_dup 4) 4)
 		    (match_dup 5)
@@ -5234,7 +5234,7 @@  (define_insn "@mve_<mve_insn>q_m_wb_u<mode>_insn"
 	 VIDWDUPQ_M))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;<mve_insn>t.u%#<V_sz_elem>\t%q2, %3, %R4, %5"
+  "vpst\;<mve_insn>t.u%#<V_sz_elem>\t%q1, %3, %R4, %5"
  [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_wb_u<mode>_insn"))
   (set_attr "type" "mve_move")
   (set_attr "length""8")])