[v2,19/36] arm: [MVE intrinsics] rework vddup vidup

Message ID 20240904132650.2720446-20-christophe.lyon@linaro.org
State New
Series arm: [MVE intrinsics] Re-implement more intrinsics

Commit Message

Christophe Lyon Sept. 4, 2024, 1:26 p.m. UTC
Implement vddup and vidup using the new MVE builtins framework.

We generate better code because we take advantage of the two outputs
produced by the v[id]dup instructions: the vector result and the
updated scalar offset, which previously had to be computed with a
separate subtraction.

For instance, before:
	ldr	r3, [r0]
	sub	r2, r3, #8
	str	r2, [r0]
	mov	r2, r3
	vddup.u16	q3, r2, #1

now:
	ldr	r2, [r0]
	vddup.u16	q3, r2, #1
	str	r2, [r0]
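
For reference, a minimal sketch (illustrative, not part of the patch) of
the kind of source that produces such sequences, using the existing ACLE
writeback intrinsic:

	#include <arm_mve.h>

	/* Generate a decrementing u16 vector starting at *offset with
	   step 1; the instruction also produces the updated offset
	   (start - 8 lanes * 1), which is stored back through the
	   pointer.  */
	uint16x8_t
	gen_ddup (uint32_t *offset)
	{
	  return vddupq_wb_u16 (offset, 1);
	}

The two outputs of the single vddup now come from one pattern, so the
compiler no longer needs a separate subtraction to compute the updated
offset.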

2024-08-21  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm-mve-builtins-base.cc (class viddup_impl): New.
	(vddup): New.
	(vidup): New.
	* config/arm/arm-mve-builtins-base.def (vddupq): New.
	(vidupq): New.
	* config/arm/arm-mve-builtins-base.h (vddupq): New.
	(vidupq): New.
	* config/arm/arm_mve.h (vddupq_m): Delete.
	(vddupq_u8): Delete.
	(vddupq_u32): Delete.
	(vddupq_u16): Delete.
	(vidupq_m): Delete.
	(vidupq_u8): Delete.
	(vidupq_u32): Delete.
	(vidupq_u16): Delete.
	(vddupq_x_u8): Delete.
	(vddupq_x_u16): Delete.
	(vddupq_x_u32): Delete.
	(vidupq_x_u8): Delete.
	(vidupq_x_u16): Delete.
	(vidupq_x_u32): Delete.
	(vddupq_m_n_u8): Delete.
	(vddupq_m_n_u32): Delete.
	(vddupq_m_n_u16): Delete.
	(vddupq_m_wb_u8): Delete.
	(vddupq_m_wb_u16): Delete.
	(vddupq_m_wb_u32): Delete.
	(vddupq_n_u8): Delete.
	(vddupq_n_u32): Delete.
	(vddupq_n_u16): Delete.
	(vddupq_wb_u8): Delete.
	(vddupq_wb_u16): Delete.
	(vddupq_wb_u32): Delete.
	(vidupq_m_n_u8): Delete.
	(vidupq_m_n_u32): Delete.
	(vidupq_m_n_u16): Delete.
	(vidupq_m_wb_u8): Delete.
	(vidupq_m_wb_u16): Delete.
	(vidupq_m_wb_u32): Delete.
	(vidupq_n_u8): Delete.
	(vidupq_n_u32): Delete.
	(vidupq_n_u16): Delete.
	(vidupq_wb_u8): Delete.
	(vidupq_wb_u16): Delete.
	(vidupq_wb_u32): Delete.
	(vddupq_x_n_u8): Delete.
	(vddupq_x_n_u16): Delete.
	(vddupq_x_n_u32): Delete.
	(vddupq_x_wb_u8): Delete.
	(vddupq_x_wb_u16): Delete.
	(vddupq_x_wb_u32): Delete.
	(vidupq_x_n_u8): Delete.
	(vidupq_x_n_u16): Delete.
	(vidupq_x_n_u32): Delete.
	(vidupq_x_wb_u8): Delete.
	(vidupq_x_wb_u16): Delete.
	(vidupq_x_wb_u32): Delete.
	(__arm_vddupq_m_n_u8): Delete.
	(__arm_vddupq_m_n_u32): Delete.
	(__arm_vddupq_m_n_u16): Delete.
	(__arm_vddupq_m_wb_u8): Delete.
	(__arm_vddupq_m_wb_u16): Delete.
	(__arm_vddupq_m_wb_u32): Delete.
	(__arm_vddupq_n_u8): Delete.
	(__arm_vddupq_n_u32): Delete.
	(__arm_vddupq_n_u16): Delete.
	(__arm_vidupq_m_n_u8): Delete.
	(__arm_vidupq_m_n_u32): Delete.
	(__arm_vidupq_m_n_u16): Delete.
	(__arm_vidupq_n_u8): Delete.
	(__arm_vidupq_m_wb_u8): Delete.
	(__arm_vidupq_m_wb_u16): Delete.
	(__arm_vidupq_m_wb_u32): Delete.
	(__arm_vidupq_n_u32): Delete.
	(__arm_vidupq_n_u16): Delete.
	(__arm_vidupq_wb_u8): Delete.
	(__arm_vidupq_wb_u16): Delete.
	(__arm_vidupq_wb_u32): Delete.
	(__arm_vddupq_wb_u8): Delete.
	(__arm_vddupq_wb_u16): Delete.
	(__arm_vddupq_wb_u32): Delete.
	(__arm_vddupq_x_n_u8): Delete.
	(__arm_vddupq_x_n_u16): Delete.
	(__arm_vddupq_x_n_u32): Delete.
	(__arm_vddupq_x_wb_u8): Delete.
	(__arm_vddupq_x_wb_u16): Delete.
	(__arm_vddupq_x_wb_u32): Delete.
	(__arm_vidupq_x_n_u8): Delete.
	(__arm_vidupq_x_n_u16): Delete.
	(__arm_vidupq_x_n_u32): Delete.
	(__arm_vidupq_x_wb_u8): Delete.
	(__arm_vidupq_x_wb_u16): Delete.
	(__arm_vidupq_x_wb_u32): Delete.
	(__arm_vddupq_m): Delete.
	(__arm_vddupq_u8): Delete.
	(__arm_vddupq_u32): Delete.
	(__arm_vddupq_u16): Delete.
	(__arm_vidupq_m): Delete.
	(__arm_vidupq_u8): Delete.
	(__arm_vidupq_u32): Delete.
	(__arm_vidupq_u16): Delete.
	(__arm_vddupq_x_u8): Delete.
	(__arm_vddupq_x_u16): Delete.
	(__arm_vddupq_x_u32): Delete.
	(__arm_vidupq_x_u8): Delete.
	(__arm_vidupq_x_u16): Delete.
	(__arm_vidupq_x_u32): Delete.
---
 gcc/config/arm/arm-mve-builtins-base.cc  | 112 ++++
 gcc/config/arm/arm-mve-builtins-base.def |   2 +
 gcc/config/arm/arm-mve-builtins-base.h   |   2 +
 gcc/config/arm/arm_mve.h                 | 676 -----------------------
 4 files changed, 116 insertions(+), 676 deletions(-)
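
As a usage sketch (these are standard ACLE intrinsics whose definitions
move from arm_mve.h into the framework; the example itself is
illustrative), the predicated writeback form exercises both the MODE_wb
pointer update and the extra inactive argument handled by the new
expander:

	#include <arm_mve.h>

	/* Predicated decrementing sequence: inactive lanes are taken
	   from 'fallback', and *offset is decremented by the total
	   increment (4 lanes * imm 4 = 16), matching what the old
	   inline wrapper did with '*__a -= __imm * 4u'.  */
	uint32x4_t
	gen_ddup_m (uint32x4_t fallback, uint32_t *offset,
		    mve_pred16_t p)
	{
	  return vddupq_m_wb_u32 (fallback, offset, 4, p);
	}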

Comments

Richard Earnshaw (lists) Oct. 14, 2024, 5:36 p.m. UTC | #1
On 04/09/2024 14:26, Christophe Lyon wrote:
> Implement vddup and vidup using the new MVE builtins framework.
> [...]

OK.

R.

Patch

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc
index 89724320d43..3d8bcdabe24 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -30,6 +30,7 @@ 
 #include "basic-block.h"
 #include "function.h"
 #include "gimple.h"
+#include "emit-rtl.h"
 #include "arm-mve-builtins.h"
 #include "arm-mve-builtins-shapes.h"
 #include "arm-mve-builtins-base.h"
@@ -349,6 +350,115 @@  public:
   }
 };
 
+/* Map the vidup / vddup function directly to CODE (UNSPEC, M) where M is the
+   vector mode associated with type suffix 0.  We need this special case
+   because in MODE_wb the builtins dereference the first parameter and update
+   its contents.  We also have to insert the two additional parameters needed
+   by the builtins compared to the intrinsics.  */
+class viddup_impl : public function_base
+{
+public:
+  CONSTEXPR viddup_impl (bool inc_dec)
+    : m_inc_dec (inc_dec)
+  {}
+
+  /* Increment (true) or decrement (false).  */
+  bool m_inc_dec;
+
+  unsigned int
+  call_properties (const function_instance &fi) const override
+  {
+    if (fi.mode_suffix_id == MODE_wb)
+      return CP_WRITE_MEMORY | CP_READ_MEMORY;
+    else
+      return 0;
+  }
+
+  tree
+  memory_scalar_type (const function_instance &) const override
+  {
+    return get_typenode_from_name (UINT32_TYPE);
+  }
+
+  rtx
+  expand (function_expander &e) const override
+  {
+    machine_mode mode = e.vector_mode (0);
+    insn_code code;
+    rtx insns, offset_ptr;
+    rtx new_offset;
+    int offset_arg_no;
+    rtx incr, total_incr;
+
+    if (! e.type_suffix (0).integer_p)
+      gcc_unreachable ();
+
+    if ((e.mode_suffix_id != MODE_n)
+	&& (e.mode_suffix_id != MODE_wb))
+      gcc_unreachable ();
+
+    offset_arg_no = (e.pred == PRED_m) ? 1 : 0;
+
+    /* In _wb mode, the start offset is passed via a pointer,
+       dereference it.  */
+    if (e.mode_suffix_id == MODE_wb)
+      {
+	rtx offset = gen_reg_rtx (SImode);
+	offset_ptr = e.args[offset_arg_no];
+	emit_insn (gen_rtx_SET (offset, gen_rtx_MEM (SImode, offset_ptr)));
+	e.args[offset_arg_no] = offset;
+      }
+
+    /* We have to shuffle parameters because the builtin needs additional
+       arguments:
+       - the updated "new_offset"
+       - total increment (incr * number of lanes)  */
+    new_offset = gen_reg_rtx (SImode);
+    e.args.quick_insert (offset_arg_no, new_offset);
+
+    incr = e.args[offset_arg_no + 2];
+    total_incr = gen_int_mode (INTVAL (incr)
+			       * GET_MODE_NUNITS (e.vector_mode (0)),
+			       SImode);
+    e.args.quick_push (total_incr);
+
+    /* _wb mode uses the _n builtins and adds code to update the
+       offset.  */
+    switch (e.pred)
+      {
+      case PRED_none:
+	/* No predicate.  */
+	code = m_inc_dec
+	  ? code_for_mve_q_u_insn (VIDUPQ, mode)
+	  : code_for_mve_q_u_insn (VDDUPQ, mode);
+	insns = e.use_exact_insn (code);
+	break;
+
+      case PRED_m:
+      case PRED_x:
+	/* "m" or "x" predicate.  */
+	code = m_inc_dec
+	  ? code_for_mve_q_m_wb_u_insn (VIDUPQ_M, mode)
+	  : code_for_mve_q_m_wb_u_insn (VDDUPQ_M, mode);
+
+	if (e.pred == PRED_m)
+	  insns = e.use_cond_insn (code, 0);
+	else
+	  insns = e.use_pred_x_insn (code);
+	break;
+
+      default:
+	gcc_unreachable ();
+      }
+
+    /* Update offset as appropriate.  */
+    if (e.mode_suffix_id == MODE_wb)
+      emit_insn (gen_rtx_SET (gen_rtx_MEM (Pmode, offset_ptr), new_offset));
+
+    return insns;
+  }
+};
+
 } /* end anonymous namespace */
 
 namespace arm_mve {
@@ -561,7 +671,9 @@  FUNCTION_WITHOUT_N_NO_F (vcvtnq, VCVTNQ)
 FUNCTION_WITHOUT_N_NO_F (vcvtpq, VCVTPQ)
 FUNCTION (vcvtbq, vcvtxq_impl, (VCVTBQ_F16_F32, VCVTBQ_M_F16_F32, VCVTBQ_F32_F16, VCVTBQ_M_F32_F16))
 FUNCTION (vcvttq, vcvtxq_impl, (VCVTTQ_F16_F32, VCVTTQ_M_F16_F32, VCVTTQ_F32_F16, VCVTTQ_M_F32_F16))
+FUNCTION (vddupq, viddup_impl, (false))
 FUNCTION_ONLY_N (vdupq, VDUPQ)
+FUNCTION (vidupq, viddup_impl, (true))
 FUNCTION_WITH_RTX_M (veorq, XOR, VEORQ)
 FUNCTION (vfmaq, unspec_mve_function_exact_insn, (-1, -1, VFMAQ_F, -1, -1, VFMAQ_N_F, -1, -1, VFMAQ_M_F, -1, -1, VFMAQ_M_N_F))
 FUNCTION (vfmasq, unspec_mve_function_exact_insn, (-1, -1, -1, -1, -1, VFMASQ_N_F, -1, -1, -1, -1, -1, VFMASQ_M_N_F))
diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def
index dd46d882882..ed3048e219a 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -46,12 +46,14 @@  DEF_MVE_FUNCTION (vctp16q, vctp, none, m_or_none)
 DEF_MVE_FUNCTION (vctp32q, vctp, none, m_or_none)
 DEF_MVE_FUNCTION (vctp64q, vctp, none, m_or_none)
 DEF_MVE_FUNCTION (vctp8q, vctp, none, m_or_none)
+DEF_MVE_FUNCTION (vddupq, viddup, all_unsigned, mx_or_none)
 DEF_MVE_FUNCTION (vdupq, unary_n, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (veorq, binary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vhaddq, binary_opt_n, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vhcaddq_rot270, binary, all_signed, mx_or_none)
 DEF_MVE_FUNCTION (vhcaddq_rot90, binary, all_signed, mx_or_none)
 DEF_MVE_FUNCTION (vhsubq, binary_opt_n, all_integer, mx_or_none)
+DEF_MVE_FUNCTION (vidupq, viddup, all_unsigned, mx_or_none)
 DEF_MVE_FUNCTION (vld1q, load, all_integer, none)
 DEF_MVE_FUNCTION (vmaxaq, binary_maxamina, all_signed, m_or_none)
 DEF_MVE_FUNCTION (vmaxavq, binary_maxavminav, all_signed, p_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h
index 41fcf666b11..526e0f8ee3a 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -66,6 +66,7 @@  extern const function_base *const vcvtnq;
 extern const function_base *const vcvtpq;
 extern const function_base *const vcvtq;
 extern const function_base *const vcvttq;
+extern const function_base *const vddupq;
 extern const function_base *const vdupq;
 extern const function_base *const veorq;
 extern const function_base *const vfmaq;
@@ -75,6 +76,7 @@  extern const function_base *const vhaddq;
 extern const function_base *const vhcaddq_rot270;
 extern const function_base *const vhcaddq_rot90;
 extern const function_base *const vhsubq;
+extern const function_base *const vidupq;
 extern const function_base *const vld1q;
 extern const function_base *const vmaxaq;
 extern const function_base *const vmaxavq;
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 49c4ea9afee..c3da491b9d1 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -82,18 +82,10 @@ 
 #define vstrwq_scatter_shifted_offset_p(__base, __offset, __value, __p) __arm_vstrwq_scatter_shifted_offset_p(__base, __offset, __value, __p)
 #define vstrwq_scatter_shifted_offset(__base, __offset, __value) __arm_vstrwq_scatter_shifted_offset(__base, __offset, __value)
 #define vuninitializedq(__v) __arm_vuninitializedq(__v)
-#define vddupq_m(__inactive, __a, __imm, __p) __arm_vddupq_m(__inactive, __a, __imm, __p)
-#define vddupq_u8(__a, __imm) __arm_vddupq_u8(__a, __imm)
-#define vddupq_u32(__a, __imm) __arm_vddupq_u32(__a, __imm)
-#define vddupq_u16(__a, __imm) __arm_vddupq_u16(__a, __imm)
 #define vdwdupq_m(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m(__inactive, __a, __b, __imm, __p)
 #define vdwdupq_u8(__a, __b, __imm) __arm_vdwdupq_u8(__a, __b, __imm)
 #define vdwdupq_u32(__a, __b, __imm) __arm_vdwdupq_u32(__a, __b, __imm)
 #define vdwdupq_u16(__a, __b, __imm) __arm_vdwdupq_u16(__a, __b, __imm)
-#define vidupq_m(__inactive, __a, __imm, __p) __arm_vidupq_m(__inactive, __a, __imm, __p)
-#define vidupq_u8(__a, __imm) __arm_vidupq_u8(__a, __imm)
-#define vidupq_u32(__a, __imm) __arm_vidupq_u32(__a, __imm)
-#define vidupq_u16(__a, __imm) __arm_vidupq_u16(__a, __imm)
 #define viwdupq_m(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m(__inactive, __a, __b, __imm, __p)
 #define viwdupq_u8(__a, __b, __imm) __arm_viwdupq_u8(__a, __b, __imm)
 #define viwdupq_u32(__a, __b, __imm) __arm_viwdupq_u32(__a, __b, __imm)
@@ -102,15 +94,9 @@ 
 #define vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p)
 #define vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p)
 #define vstrwq_scatter_base_wb(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb(__addr, __offset, __value)
-#define vddupq_x_u8(__a, __imm, __p) __arm_vddupq_x_u8(__a, __imm, __p)
-#define vddupq_x_u16(__a, __imm, __p) __arm_vddupq_x_u16(__a, __imm, __p)
-#define vddupq_x_u32(__a, __imm, __p) __arm_vddupq_x_u32(__a, __imm, __p)
 #define vdwdupq_x_u8(__a, __b, __imm, __p) __arm_vdwdupq_x_u8(__a, __b, __imm, __p)
 #define vdwdupq_x_u16(__a, __b, __imm, __p) __arm_vdwdupq_x_u16(__a, __b, __imm, __p)
 #define vdwdupq_x_u32(__a, __b, __imm, __p) __arm_vdwdupq_x_u32(__a, __b, __imm, __p)
-#define vidupq_x_u8(__a, __imm, __p) __arm_vidupq_x_u8(__a, __imm, __p)
-#define vidupq_x_u16(__a, __imm, __p) __arm_vidupq_x_u16(__a, __imm, __p)
-#define vidupq_x_u32(__a, __imm, __p) __arm_vidupq_x_u32(__a, __imm, __p)
 #define viwdupq_x_u8(__a, __b, __imm, __p) __arm_viwdupq_x_u8(__a, __b, __imm, __p)
 #define viwdupq_x_u16(__a, __b, __imm, __p) __arm_viwdupq_x_u16(__a, __b, __imm, __p)
 #define viwdupq_x_u32(__a, __b, __imm, __p) __arm_viwdupq_x_u32(__a, __b, __imm, __p)
@@ -337,18 +323,6 @@ 
 #define vuninitializedq_s64(void) __arm_vuninitializedq_s64(void)
 #define vuninitializedq_f16(void) __arm_vuninitializedq_f16(void)
 #define vuninitializedq_f32(void) __arm_vuninitializedq_f32(void)
-#define vddupq_m_n_u8(__inactive, __a,  __imm, __p) __arm_vddupq_m_n_u8(__inactive, __a,  __imm, __p)
-#define vddupq_m_n_u32(__inactive, __a,  __imm, __p) __arm_vddupq_m_n_u32(__inactive, __a,  __imm, __p)
-#define vddupq_m_n_u16(__inactive, __a,  __imm, __p) __arm_vddupq_m_n_u16(__inactive, __a,  __imm, __p)
-#define vddupq_m_wb_u8(__inactive,  __a,  __imm, __p) __arm_vddupq_m_wb_u8(__inactive,  __a,  __imm, __p)
-#define vddupq_m_wb_u16(__inactive,  __a,  __imm, __p) __arm_vddupq_m_wb_u16(__inactive,  __a,  __imm, __p)
-#define vddupq_m_wb_u32(__inactive,  __a,  __imm, __p) __arm_vddupq_m_wb_u32(__inactive,  __a,  __imm, __p)
-#define vddupq_n_u8(__a,  __imm) __arm_vddupq_n_u8(__a,  __imm)
-#define vddupq_n_u32(__a,  __imm) __arm_vddupq_n_u32(__a,  __imm)
-#define vddupq_n_u16(__a,  __imm) __arm_vddupq_n_u16(__a,  __imm)
-#define vddupq_wb_u8( __a,  __imm) __arm_vddupq_wb_u8( __a,  __imm)
-#define vddupq_wb_u16( __a,  __imm) __arm_vddupq_wb_u16( __a,  __imm)
-#define vddupq_wb_u32( __a,  __imm) __arm_vddupq_wb_u32( __a,  __imm)
 #define vdwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p) __arm_vdwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p)
 #define vdwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p) __arm_vdwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p)
 #define vdwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p) __arm_vdwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p)
@@ -361,18 +335,6 @@ 
 #define vdwdupq_wb_u8( __a, __b,  __imm) __arm_vdwdupq_wb_u8( __a, __b,  __imm)
 #define vdwdupq_wb_u32( __a, __b,  __imm) __arm_vdwdupq_wb_u32( __a, __b,  __imm)
 #define vdwdupq_wb_u16( __a, __b,  __imm) __arm_vdwdupq_wb_u16( __a, __b,  __imm)
-#define vidupq_m_n_u8(__inactive, __a,  __imm, __p) __arm_vidupq_m_n_u8(__inactive, __a,  __imm, __p)
-#define vidupq_m_n_u32(__inactive, __a,  __imm, __p) __arm_vidupq_m_n_u32(__inactive, __a,  __imm, __p)
-#define vidupq_m_n_u16(__inactive, __a,  __imm, __p) __arm_vidupq_m_n_u16(__inactive, __a,  __imm, __p)
-#define vidupq_m_wb_u8(__inactive,  __a,  __imm, __p) __arm_vidupq_m_wb_u8(__inactive,  __a,  __imm, __p)
-#define vidupq_m_wb_u16(__inactive,  __a,  __imm, __p) __arm_vidupq_m_wb_u16(__inactive,  __a,  __imm, __p)
-#define vidupq_m_wb_u32(__inactive,  __a,  __imm, __p) __arm_vidupq_m_wb_u32(__inactive,  __a,  __imm, __p)
-#define vidupq_n_u8(__a,  __imm) __arm_vidupq_n_u8(__a,  __imm)
-#define vidupq_n_u32(__a,  __imm) __arm_vidupq_n_u32(__a,  __imm)
-#define vidupq_n_u16(__a,  __imm) __arm_vidupq_n_u16(__a,  __imm)
-#define vidupq_wb_u8( __a,  __imm) __arm_vidupq_wb_u8( __a,  __imm)
-#define vidupq_wb_u16( __a,  __imm) __arm_vidupq_wb_u16( __a,  __imm)
-#define vidupq_wb_u32( __a,  __imm) __arm_vidupq_wb_u32( __a,  __imm)
 #define viwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p) __arm_viwdupq_m_n_u8(__inactive, __a, __b,  __imm, __p)
 #define viwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p) __arm_viwdupq_m_n_u32(__inactive, __a, __b,  __imm, __p)
 #define viwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p) __arm_viwdupq_m_n_u16(__inactive, __a, __b,  __imm, __p)
@@ -405,24 +367,12 @@ 
 #define vstrwq_scatter_base_wb_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_s32(__addr, __offset, __value)
 #define vstrwq_scatter_base_wb_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_u32(__addr, __offset, __value)
 #define vstrwq_scatter_base_wb_f32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_f32(__addr, __offset, __value)
-#define vddupq_x_n_u8(__a,  __imm, __p) __arm_vddupq_x_n_u8(__a,  __imm, __p)
-#define vddupq_x_n_u16(__a,  __imm, __p) __arm_vddupq_x_n_u16(__a,  __imm, __p)
-#define vddupq_x_n_u32(__a,  __imm, __p) __arm_vddupq_x_n_u32(__a,  __imm, __p)
-#define vddupq_x_wb_u8(__a,  __imm, __p) __arm_vddupq_x_wb_u8(__a,  __imm, __p)
-#define vddupq_x_wb_u16(__a,  __imm, __p) __arm_vddupq_x_wb_u16(__a,  __imm, __p)
-#define vddupq_x_wb_u32(__a,  __imm, __p) __arm_vddupq_x_wb_u32(__a,  __imm, __p)
 #define vdwdupq_x_n_u8(__a, __b,  __imm, __p) __arm_vdwdupq_x_n_u8(__a, __b,  __imm, __p)
 #define vdwdupq_x_n_u16(__a, __b,  __imm, __p) __arm_vdwdupq_x_n_u16(__a, __b,  __imm, __p)
 #define vdwdupq_x_n_u32(__a, __b,  __imm, __p) __arm_vdwdupq_x_n_u32(__a, __b,  __imm, __p)
 #define vdwdupq_x_wb_u8(__a, __b,  __imm, __p) __arm_vdwdupq_x_wb_u8(__a, __b,  __imm, __p)
 #define vdwdupq_x_wb_u16(__a, __b,  __imm, __p) __arm_vdwdupq_x_wb_u16(__a, __b,  __imm, __p)
 #define vdwdupq_x_wb_u32(__a, __b,  __imm, __p) __arm_vdwdupq_x_wb_u32(__a, __b,  __imm, __p)
-#define vidupq_x_n_u8(__a,  __imm, __p) __arm_vidupq_x_n_u8(__a,  __imm, __p)
-#define vidupq_x_n_u16(__a,  __imm, __p) __arm_vidupq_x_n_u16(__a,  __imm, __p)
-#define vidupq_x_n_u32(__a,  __imm, __p) __arm_vidupq_x_n_u32(__a,  __imm, __p)
-#define vidupq_x_wb_u8(__a,  __imm, __p) __arm_vidupq_x_wb_u8(__a,  __imm, __p)
-#define vidupq_x_wb_u16(__a,  __imm, __p) __arm_vidupq_x_wb_u16(__a,  __imm, __p)
-#define vidupq_x_wb_u32(__a,  __imm, __p) __arm_vidupq_x_wb_u32(__a,  __imm, __p)
 #define viwdupq_x_n_u8(__a, __b,  __imm, __p) __arm_viwdupq_x_n_u8(__a, __b,  __imm, __p)
 #define viwdupq_x_n_u16(__a, __b,  __imm, __p) __arm_viwdupq_x_n_u16(__a, __b,  __imm, __p)
 #define viwdupq_x_n_u32(__a, __b,  __imm, __p) __arm_viwdupq_x_n_u32(__a, __b,  __imm, __p)
@@ -1722,75 +1672,6 @@  __arm_vstrwq_scatter_shifted_offset_u32 (uint32_t * __base, uint32x4_t __offset,
   __builtin_mve_vstrwq_scatter_shifted_offset_uv4si ((__builtin_neon_si *) __base, __offset, __value);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vddupq_m_n_uv16qi (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vddupq_m_n_uv4si (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vddupq_m_n_uv8hi (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
-  uint8x16_t __res = __builtin_mve_vddupq_m_n_uv16qi (__inactive, * __a, __imm, __p);
-  *__a -= __imm * 16u;
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
-  uint16x8_t __res = __builtin_mve_vddupq_m_n_uv8hi (__inactive, *__a, __imm, __p);
-  *__a -= __imm * 8u;
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
-  uint32x4_t __res = __builtin_mve_vddupq_m_n_uv4si (__inactive, *__a, __imm, __p);
-  *__a -= __imm * 4u;
-  return __res;
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_n_u8 (uint32_t __a, const int __imm)
-{
-  return __builtin_mve_vddupq_n_uv16qi (__a, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_n_u32 (uint32_t __a, const int __imm)
-{
-  return __builtin_mve_vddupq_n_uv4si (__a, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_n_u16 (uint32_t __a, const int __imm)
-{
-  return __builtin_mve_vddupq_n_uv8hi (__a, __imm);
-}
-
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
@@ -1899,129 +1780,6 @@  __arm_vdwdupq_wb_u16 (uint32_t * __a, uint32_t __b, const int __imm)
   return __res;
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vidupq_m_n_uv16qi (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vidupq_m_n_uv4si (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vidupq_m_n_uv8hi (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_n_u8 (uint32_t __a, const int __imm)
-{
-  return __builtin_mve_vidupq_n_uv16qi (__a, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
-  uint8x16_t __res = __builtin_mve_vidupq_m_n_uv16qi (__inactive, *__a, __imm, __p);
-  *__a += __imm * 16u;
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
-  uint16x8_t __res = __builtin_mve_vidupq_m_n_uv8hi (__inactive, *__a, __imm, __p);
-  *__a += __imm * 8u;
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
-  uint32x4_t __res = __builtin_mve_vidupq_m_n_uv4si (__inactive, *__a, __imm, __p);
-  *__a += __imm * 4u;
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_n_u32 (uint32_t __a, const int __imm)
-{
-  return __builtin_mve_vidupq_n_uv4si (__a, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_n_u16 (uint32_t __a, const int __imm)
-{
-  return __builtin_mve_vidupq_n_uv8hi (__a, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_wb_u8 (uint32_t * __a, const int __imm)
-{
-  uint8x16_t __res = __builtin_mve_vidupq_n_uv16qi (*__a, __imm);
-  *__a += __imm * 16u;
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_wb_u16 (uint32_t * __a, const int __imm)
-{
-  uint16x8_t __res = __builtin_mve_vidupq_n_uv8hi (*__a, __imm);
-  *__a += __imm * 8u;
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_wb_u32 (uint32_t * __a, const int __imm)
-{
-  uint32x4_t __res = __builtin_mve_vidupq_n_uv4si (*__a, __imm);
-  *__a += __imm * 4u;
-  return __res;
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_wb_u8 (uint32_t * __a, const int __imm)
-{
-  uint8x16_t __res = __builtin_mve_vddupq_n_uv16qi (*__a, __imm);
-  *__a -= __imm * 16u;
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_wb_u16 (uint32_t * __a, const int __imm)
-{
-  uint16x8_t __res = __builtin_mve_vddupq_n_uv8hi (*__a, __imm);
-  *__a -= __imm * 8u;
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_wb_u32 (uint32_t * __a, const int __imm)
-{
-  uint32x4_t __res = __builtin_mve_vddupq_n_uv4si (*__a, __imm);
-  *__a -= __imm * 4u;
-  return __res;
-}
-
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
@@ -2267,57 +2025,6 @@  __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint3
   *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_n_u8 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vddupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_n_u16 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vddupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_n_u32 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vddupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_wb_u8 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
-  uint8x16_t __arg1 = __arm_vuninitializedq_u8 ();
-  uint8x16_t __res = __builtin_mve_vddupq_m_n_uv16qi (__arg1, * __a, __imm, __p);
-  *__a -= __imm * 16u;
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_wb_u16 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
-  uint16x8_t __arg1 = __arm_vuninitializedq_u16 ();
-  uint16x8_t __res = __builtin_mve_vddupq_m_n_uv8hi (__arg1, *__a, __imm, __p);
-  *__a -= __imm * 8u;
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_wb_u32 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
-  uint32x4_t __arg1 = __arm_vuninitializedq_u32 ();
-  uint32x4_t __res = __builtin_mve_vddupq_m_n_uv4si (__arg1, *__a, __imm, __p);
-  *__a -= __imm * 4u;
-  return __res;
-}
-
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
@@ -2375,57 +2082,6 @@  __arm_vdwdupq_x_wb_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16
   return __res;
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_n_u8 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vidupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_n_u16 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vidupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_n_u32 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
-  return __builtin_mve_vidupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_wb_u8 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
-  uint8x16_t __arg1 = __arm_vuninitializedq_u8 ();
-  uint8x16_t __res = __builtin_mve_vidupq_m_n_uv16qi (__arg1, *__a, __imm, __p);
-  *__a += __imm * 16u;
-  return __res;
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_wb_u16 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
-  uint16x8_t __arg1 = __arm_vuninitializedq_u16 ();
-  uint16x8_t __res = __builtin_mve_vidupq_m_n_uv8hi (__arg1, *__a, __imm, __p);
-  *__a += __imm * 8u;
-  return __res;
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_wb_u32 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
-  uint32x4_t __arg1 = __arm_vuninitializedq_u32 ();
-  uint32x4_t __res = __builtin_mve_vidupq_m_n_uv4si (__arg1, *__a, __imm, __p);
-  *__a += __imm * 4u;
-  return __res;
-}
-
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
@@ -4475,69 +4131,6 @@  __arm_vstrwq_scatter_shifted_offset (uint32_t * __base, uint32x4_t __offset, uin
  __arm_vstrwq_scatter_shifted_offset_u32 (__base, __offset, __value);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m (uint8x16_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_m_n_u8 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m (uint32x4_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_m_n_u32 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m (uint16x8_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_m_n_u16 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m (uint8x16_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_m_wb_u8 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m (uint16x8_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_m_wb_u16 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_m (uint32x4_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_m_wb_u32 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_u8 (uint32_t __a, const int __imm)
-{
- return __arm_vddupq_n_u8 (__a, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_u32 (uint32_t __a, const int __imm)
-{
- return __arm_vddupq_n_u32 (__a, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_u16 (uint32_t __a, const int __imm)
-{
- return __arm_vddupq_n_u16 (__a, __imm);
-}
-
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
@@ -4622,111 +4215,6 @@  __arm_vdwdupq_u16 (uint32_t * __a, uint32_t __b, const int __imm)
  return __arm_vdwdupq_wb_u16 (__a, __b, __imm);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m (uint8x16_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_m_n_u8 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m (uint32x4_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_m_n_u32 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m (uint16x8_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_m_n_u16 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_u8 (uint32_t __a, const int __imm)
-{
- return __arm_vidupq_n_u8 (__a, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m (uint8x16_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_m_wb_u8 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m (uint16x8_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_m_wb_u16 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_m (uint32x4_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_m_wb_u32 (__inactive, __a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_u32 (uint32_t __a, const int __imm)
-{
- return __arm_vidupq_n_u32 (__a, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_u16 (uint32_t __a, const int __imm)
-{
- return __arm_vidupq_n_u16 (__a, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_u8 (uint32_t * __a, const int __imm)
-{
- return __arm_vidupq_wb_u8 (__a, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_u16 (uint32_t * __a, const int __imm)
-{
- return __arm_vidupq_wb_u16 (__a, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_u32 (uint32_t * __a, const int __imm)
-{
- return __arm_vidupq_wb_u32 (__a, __imm);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_u8 (uint32_t * __a, const int __imm)
-{
- return __arm_vddupq_wb_u8 (__a, __imm);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_u16 (uint32_t * __a, const int __imm)
-{
- return __arm_vddupq_wb_u16 (__a, __imm);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_u32 (uint32_t * __a, const int __imm)
-{
- return __arm_vddupq_wb_u32 (__a, __imm);
-}
-
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_m (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
@@ -4867,48 +4355,6 @@  __arm_vstrwq_scatter_base_wb (uint32x4_t * __addr, const int __offset, uint32x4_
  __arm_vstrwq_scatter_base_wb_u32 (__addr, __offset, __value);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_u8 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_x_n_u8 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_u16 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_x_n_u16 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_u32 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_x_n_u32 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_u8 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_x_wb_u8 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_u16 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_x_wb_u16 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vddupq_x_u32 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vddupq_x_wb_u32 (__a, __imm, __p);
-}
-
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_x_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
@@ -4951,48 +4397,6 @@  __arm_vdwdupq_x_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t
  return __arm_vdwdupq_x_wb_u32 (__a, __b, __imm, __p);
 }
 
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_u8 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_x_n_u8 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_u16 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_x_n_u16 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_u32 (uint32_t __a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_x_n_u32 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint8x16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_u8 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_x_wb_u8 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint16x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_u16 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_x_wb_u16 (__a, __imm, __p);
-}
-
-__extension__ extern __inline uint32x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-__arm_vidupq_x_u32 (uint32_t *__a, const int __imm, mve_pred16_t __p)
-{
- return __arm_vidupq_x_wb_u32 (__a, __imm, __p);
-}
-
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_x_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
@@ -6773,36 +6177,6 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u32 ((uint32_t) __p1, p2, p3, p4), \
   int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));})
 
-#define __arm_vidupq_x_u8(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vidupq_x_n_u8 ((uint32_t) __p1, p2, p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));})
-
-#define __arm_vddupq_x_u8(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vddupq_x_n_u8 ((uint32_t) __p1, p2, p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));})
-
-#define __arm_vidupq_x_u16(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vidupq_x_n_u16 ((uint32_t) __p1, p2, p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));})
-
-#define __arm_vddupq_x_u16(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vddupq_x_n_u16 ((uint32_t) __p1, p2, p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));})
-
-#define __arm_vidupq_x_u32(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vidupq_x_n_u32 ((uint32_t) __p1, p2, p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));})
-
-#define __arm_vddupq_x_u32(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vddupq_x_n_u32 ((uint32_t) __p1, p2, p3), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));})
-
 #define __arm_vadciq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \
   __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
@@ -6905,56 +6279,6 @@  extern void *__ARM_undef;
   int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16(__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \
   int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32(__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));})
 
-#define __arm_vidupq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
- __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
- int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vidupq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), (uint32_t) __p1, p2, p3), \
- int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vidupq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), (uint32_t) __p1, p2, p3), \
- int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vidupq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), (uint32_t) __p1, p2, p3), \
- int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_m_wb_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3), \
- int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_m_wb_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3), \
- int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_m_wb_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));})
-
-#define __arm_vddupq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \
- __typeof(p1) __p1 = (p1); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
- int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vddupq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), (uint32_t) __p1, p2, p3), \
- int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vddupq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), (uint32_t) __p1, p2, p3), \
- int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vddupq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), (uint32_t) __p1, p2, p3), \
- int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_m_wb_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3), \
- int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_m_wb_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3), \
- int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_m_wb_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));})
-
-#define __arm_vidupq_u16(p0,p1) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vidupq_n_u16 ((uint32_t) __p0, p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_wb_u16 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));})
-
-#define __arm_vidupq_u32(p0,p1) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vidupq_n_u32 ((uint32_t) __p0, p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_wb_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));})
-
-#define __arm_vidupq_u8(p0,p1) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vidupq_n_u8 ((uint32_t) __p0, p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_wb_u8 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));})
-
-#define __arm_vddupq_u16(p0,p1) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vddupq_n_u16 ((uint32_t) __p0, p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_wb_u16 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));})
-
-#define __arm_vddupq_u32(p0,p1) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vddupq_n_u32 ((uint32_t) __p0, p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_wb_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));})
-
-#define __arm_vddupq_u8(p0,p1) ({ __typeof(p0) __p0 = (p0); \
-  _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
-  int (*)[__ARM_mve_type_int_n]: __arm_vddupq_n_u8 ((uint32_t) __p0, p1), \
-  int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_wb_u8 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));})
-
 #define __arm_viwdupq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \
   __typeof(p1) __p1 = (p1); \
   _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \