diff mbox series

[v2,15/16] Arm: Add MVE RTL patterns for Complex Addition, Multiply and FMA.

Message ID 20200925143131.GA30552@arm.com
State New
Headers show
Series middle-end Add support for SLP vectorization of complex number instructions. | expand

Commit Message

Tamar Christina Sept. 25, 2020, 2:31 p.m. UTC
Hi All,

This adds implementation for the optabs for complex operations.  With this the
following C code:

  void f90 (int _Complex a[restrict N], int _Complex b[restrict N],
	    int _Complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] = a[i] + (b[i] * I);
  }

generates

  .L3:
	  mov     r3, r0
	  vldrw.32	q2, [r3]
	  mov     r3, r1
	  vldrw.32	q1, [r3]
	  mov     r3, r2
	  vcadd.i32       q3, q2, q1, #90
	  adds    r0, r0, #16
	  vstrw.32	q3, [r3]
	  adds    r1, r1, #16
	  adds    r2, r2, #16
	  le      lr, .L3
	  pop     {r4, r5, r6, r7, r8, pc}

which is not ideal due to register allocation and addressing mode issues with
MVE in general.  However -frename-register cleans up the register allocation:

  .L3:
	  mov     r5, r0
	  mov     r6, r1
	  vldrw.32	q2, [r5]
	  vldrw.32	q1, [r6]
	  mov     r7, r2
	  vcadd.i32       q3, q2, q1, #90
	  adds    r0, r0, #16
	  vstrw.32	q3, [r7]
	  adds    r1, r1, #16
	  adds    r2, r2, #16
	  le      lr, .L3
	  pop     {r4, r5, r6, r7, r8, pc}

but leaves the addressing mode problems.

Before this patch it generated a scalar loop

  .L2:
	  ldr     r7, [r0, r3, lsl #2]
	  ldr     r5, [r6, r3, lsl #2]
	  ldr     r4, [r1, r3, lsl #2]
	  subs    r5, r7, r5
	  ldr     r7, [lr, r3, lsl #2]
	  add     r4, r4, r7
	  str     r5, [r2, r3, lsl #2]
	  str     r4, [ip, r3, lsl #2]
	  adds    r3, r3, #2
	  cmp     r3, #200
	  bne     .L2
	  pop     {r4, r5, r6, r7, pc}



Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
Cross compiled arm-none-eabi and ran with -march=armv8.1-m.main+mve.fp
-mfloat-abi=hard -mfpu=auto and regression is on-going.

Unfortunately MVE does not currently implement auto-vectorization of floating
point values.  As such I cannot test this directly.  But since they share 90%
of the code with NEON these should just work whenever support is added so I
would still like to commit these.

To support this I had to refactor the MVE bits a bit.  This now uses the same
unspecs for both NEON and MVE and removes the unneeded different signed and
unsigned unspecs since they both point to the signed instruction.

I have tried multiple approaches to cleaning this up but I think this is the
nicest it can get given the slight ISA differences.

Ok for master if no issues?

Thanks,
Tamar

gcc/ChangeLog:

	* config/arm/arm_mve.h (__arm_vcaddq_rot90_u8, __arm_vcaddq_rot270_u8,
	, __arm_vcaddq_rot90_s8, __arm_vcaddq_rot270_s8,
	__arm_vcaddq_rot90_u16, __arm_vcaddq_rot270_u16, __arm_vcaddq_rot90_s16,
	__arm_vcaddq_rot270_s16, __arm_vcaddq_rot90_u32,
	__arm_vcaddq_rot270_u32, __arm_vcaddq_rot90_s32,
	__arm_vcaddq_rot270_s32, __arm_vcmulq_rot90_f16,
	__arm_vcmulq_rot270_f16, __arm_vcmulq_rot180_f16,
	__arm_vcmulq_f16, __arm_vcaddq_rot90_f16, __arm_vcaddq_rot270_f16,
	__arm_vcmulq_rot90_f32, __arm_vcmulq_rot270_f32,
	__arm_vcmulq_rot180_f32, __arm_vcmulq_f32, __arm_vcaddq_rot90_f32,
	__arm_vcaddq_rot270_f32, __arm_vcmlaq_f16, __arm_vcmlaq_rot180_f16,
	__arm_vcmlaq_rot270_f16, __arm_vcmlaq_rot90_f16, __arm_vcmlaq_f32,
	__arm_vcmlaq_rot180_f32, __arm_vcmlaq_rot270_f32,
	__arm_vcmlaq_rot90_f32): Update builtin calls.
	* config/arm/arm_mve_builtins.def (vcaddq_rot90_u, vcaddq_rot270_u,
	vcaddq_rot90_s, vcaddq_rot270_s, vcaddq_rot90_f, vcaddq_rot270_f,
	vcmulq_f, vcmulq_rot90_f, vcmulq_rot180_f, vcmulq_rot270_f,
	vcmlaq_f, vcmlaq_rot90_f, vcmlaq_rot180_f, vcmlaq_rot270_f): Removed.
	(vcaddq_rot90, vcaddq_rot270, vcmulq, vcmulq_rot90, vcmulq_rot180,
	vcmulq_rot270, vcmlaq, vcmlaq_rot90, vcmlaq_rot180, vcmlaq_rot270):
	New.
	* config/arm/constraints.md (Dz): Include MVE.
	* config/arm/iterators.md (mve_rotsplit1, mve_rotsplit2): New.
	* config/arm/mve.md (VCADDQ_ROT270_S, VCADDQ_ROT90_S, VCADDQ_ROT270_U,
	VCADDQ_ROT90_U, VCADDQ_ROT270_F, VCADDQ_ROT90_F, VCMULQ_F,
	VCMULQ_ROT180_F, VCMULQ_ROT270_F, VCMULQ_ROT90_F, VCMLAQ_F,
	VCMLAQ_ROT180_F, VCMLAQ_ROT90_F, VCMLAQ_ROT270_F, VCADDQ_ROT270_S,
	VCADDQ_ROT270, VCADDQ_ROT90): Removed.
	(mve_rot, VCMUL): New.
	(mve_vcaddq_rot270_<supf><mode, mve_vcaddq_rot90_<supf><mode>,
	mve_vcaddq_rot270_f<mode>, mve_vcaddq_rot90_f<mode>, mve_vcmulq_f<mode,
	mve_vcmulq_rot180_f<mode>, mve_vcmulq_rot270_f<mode>,
	mve_vcmulq_rot90_f<mode>, mve_vcmlaq_f<mode>, mve_vcmlaq_rot180_f<mode>,
	mve_vcmlaq_rot270_f<mode>, mve_vcmlaq_rot90_f<mode>): Removed.
	(mve_vcmlaq<mve_rot><mode>, mve_vcmulq<mve_rot><mode>,
	mve_vcaddq<mve_rot><mode>, cadd<rot><mode>3, mve_vcaddq<mve_rot><mode>):
	New.
	* config/arm/neon.md (cadd<rot><mode>3, cml<fcmac1><rot_op><mode>4):
	Moved.
	(cmul<rot_op><mode>3): Exclude MVE types.
	* config/arm/unspecs.md (UNSPEC_VCMUL90, UNSPEC_VCMUL270): New.
	* config/arm/vec-common.md (cadd<rot><mode>3, cmul<rot_op><mode>3,
	arm_vcmla<rot><mode>, cml<fcmac1><rot_op><mode>4): New.

--

Comments

Tamar Christina Nov. 14, 2020, 3:11 p.m. UTC | #1
ping

> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:32 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Ramana Radhakrishnan <Ramana.Radhakrishnan@arm.com>
> Subject: [PATCH v2 15/16]Arm: Add MVE RTL patterns for Complex Addition,
> Multiply and FMA.
> 
> Hi All,
> 
> This adds implementation for the optabs for complex operations.  With this
> the following C code:
> 
>   void f90 (int _Complex a[restrict N], int _Complex b[restrict N],
> 	    int _Complex c[restrict N])
>   {
>     for (int i=0; i < N; i++)
>       c[i] = a[i] + (b[i] * I);
>   }
> 
> generates
> 
>   .L3:
> 	  mov     r3, r0
> 	  vldrw.32	q2, [r3]
> 	  mov     r3, r1
> 	  vldrw.32	q1, [r3]
> 	  mov     r3, r2
> 	  vcadd.i32       q3, q2, q1, #90
> 	  adds    r0, r0, #16
> 	  vstrw.32	q3, [r3]
> 	  adds    r1, r1, #16
> 	  adds    r2, r2, #16
> 	  le      lr, .L3
> 	  pop     {r4, r5, r6, r7, r8, pc}
> 
> which is not ideal due to register allocation and addressing mode issues with
> MVE in general.  However -frename-register cleans up the register allocation:
> 
>   .L3:
> 	  mov     r5, r0
> 	  mov     r6, r1
> 	  vldrw.32	q2, [r5]
> 	  vldrw.32	q1, [r6]
> 	  mov     r7, r2
> 	  vcadd.i32       q3, q2, q1, #90
> 	  adds    r0, r0, #16
> 	  vstrw.32	q3, [r7]
> 	  adds    r1, r1, #16
> 	  adds    r2, r2, #16
> 	  le      lr, .L3
> 	  pop     {r4, r5, r6, r7, r8, pc}
> 
> but leaves the addressing mode problems.
> 
> Before this patch it generated a scalar loop
> 
>   .L2:
> 	  ldr     r7, [r0, r3, lsl #2]
> 	  ldr     r5, [r6, r3, lsl #2]
> 	  ldr     r4, [r1, r3, lsl #2]
> 	  subs    r5, r7, r5
> 	  ldr     r7, [lr, r3, lsl #2]
> 	  add     r4, r4, r7
> 	  str     r5, [r2, r3, lsl #2]
> 	  str     r4, [ip, r3, lsl #2]
> 	  adds    r3, r3, #2
> 	  cmp     r3, #200
> 	  bne     .L2
> 	  pop     {r4, r5, r6, r7, pc}
> 
> 
> 
> Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
> Cross compiled arm-none-eabi and ran with -march=armv8.1-
> m.main+mve.fp -mfloat-abi=hard -mfpu=auto and regression is on-going.
> 
> Unfortunately MVE does not currently implement auto-vectorization of
> floating point values.  As such I cannot test this directly.  But since they share
> 90% of the code with NEON these should just work whenever support is
> added so I would still like to commit these.
> 
> To support this I had to refactor the MVE bits a bit.  This now uses the same
> unspecs for both NEON and MVE and removes the unneeded different
> signed and unsigned unspecs since they both point to the signed instruction.
> 
> I have tried multiple approaches to cleaning this up but I think this is the
> nicest it can get given the slight ISA differences.
> 
> Ok for master if no issues?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/arm/arm_mve.h (__arm_vcaddq_rot90_u8,
> __arm_vcaddq_rot270_u8,
> 	, __arm_vcaddq_rot90_s8, __arm_vcaddq_rot270_s8,
> 	__arm_vcaddq_rot90_u16, __arm_vcaddq_rot270_u16,
> __arm_vcaddq_rot90_s16,
> 	__arm_vcaddq_rot270_s16, __arm_vcaddq_rot90_u32,
> 	__arm_vcaddq_rot270_u32, __arm_vcaddq_rot90_s32,
> 	__arm_vcaddq_rot270_s32, __arm_vcmulq_rot90_f16,
> 	__arm_vcmulq_rot270_f16, __arm_vcmulq_rot180_f16,
> 	__arm_vcmulq_f16, __arm_vcaddq_rot90_f16,
> __arm_vcaddq_rot270_f16,
> 	__arm_vcmulq_rot90_f32, __arm_vcmulq_rot270_f32,
> 	__arm_vcmulq_rot180_f32, __arm_vcmulq_f32,
> __arm_vcaddq_rot90_f32,
> 	__arm_vcaddq_rot270_f32, __arm_vcmlaq_f16,
> __arm_vcmlaq_rot180_f16,
> 	__arm_vcmlaq_rot270_f16, __arm_vcmlaq_rot90_f16,
> __arm_vcmlaq_f32,
> 	__arm_vcmlaq_rot180_f32, __arm_vcmlaq_rot270_f32,
> 	__arm_vcmlaq_rot90_f32): Update builtin calls.
> 	* config/arm/arm_mve_builtins.def (vcaddq_rot90_u,
> vcaddq_rot270_u,
> 	vcaddq_rot90_s, vcaddq_rot270_s, vcaddq_rot90_f,
> vcaddq_rot270_f,
> 	vcmulq_f, vcmulq_rot90_f, vcmulq_rot180_f, vcmulq_rot270_f,
> 	vcmlaq_f, vcmlaq_rot90_f, vcmlaq_rot180_f, vcmlaq_rot270_f):
> Removed.
> 	(vcaddq_rot90, vcaddq_rot270, vcmulq, vcmulq_rot90,
> vcmulq_rot180,
> 	vcmulq_rot270, vcmlaq, vcmlaq_rot90, vcmlaq_rot180,
> vcmlaq_rot270):
> 	New.
> 	* config/arm/constraints.md (Dz): Include MVE.
> 	* config/arm/iterators.md (mve_rotsplit1, mve_rotsplit2): New.
> 	* config/arm/mve.md (VCADDQ_ROT270_S, VCADDQ_ROT90_S,
> VCADDQ_ROT270_U,
> 	VCADDQ_ROT90_U, VCADDQ_ROT270_F, VCADDQ_ROT90_F,
> VCMULQ_F,
> 	VCMULQ_ROT180_F, VCMULQ_ROT270_F, VCMULQ_ROT90_F,
> VCMLAQ_F,
> 	VCMLAQ_ROT180_F, VCMLAQ_ROT90_F, VCMLAQ_ROT270_F,
> VCADDQ_ROT270_S,
> 	VCADDQ_ROT270, VCADDQ_ROT90): Removed.
> 	(mve_rot, VCMUL): New.
> 	(mve_vcaddq_rot270_<supf><mode,
> mve_vcaddq_rot90_<supf><mode>,
> 	mve_vcaddq_rot270_f<mode>, mve_vcaddq_rot90_f<mode>,
> mve_vcmulq_f<mode,
> 	mve_vcmulq_rot180_f<mode>, mve_vcmulq_rot270_f<mode>,
> 	mve_vcmulq_rot90_f<mode>, mve_vcmlaq_f<mode>,
> mve_vcmlaq_rot180_f<mode>,
> 	mve_vcmlaq_rot270_f<mode>, mve_vcmlaq_rot90_f<mode>):
> Removed.
> 	(mve_vcmlaq<mve_rot><mode>, mve_vcmulq<mve_rot><mode>,
> 	mve_vcaddq<mve_rot><mode>, cadd<rot><mode>3,
> mve_vcaddq<mve_rot><mode>):
> 	New.
> 	* config/arm/neon.md (cadd<rot><mode>3,
> cml<fcmac1><rot_op><mode>4):
> 	Moved.
> 	(cmul<rot_op><mode>3): Exclude MVE types.
> 	* config/arm/unspecs.md (UNSPEC_VCMUL90, UNSPEC_VCMUL270):
> New.
> 	* config/arm/vec-common.md (cadd<rot><mode>3,
> cmul<rot_op><mode>3,
> 	arm_vcmla<rot><mode>, cml<fcmac1><rot_op><mode>4): New.
> 
> --
Kyrylo Tkachov Nov. 16, 2020, 9:24 a.m. UTC | #2
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: 25 September 2020 15:32
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH v2 15/16]Arm: Add MVE RTL patterns for Complex Addition,
> Multiply and FMA.
> 
> Hi All,
> 
> This adds implementation for the optabs for complex operations.  With this
> the
> following C code:
> 
>   void f90 (int _Complex a[restrict N], int _Complex b[restrict N],
> 	    int _Complex c[restrict N])
>   {
>     for (int i=0; i < N; i++)
>       c[i] = a[i] + (b[i] * I);
>   }
> 
> generates
> 
>   .L3:
> 	  mov     r3, r0
> 	  vldrw.32	q2, [r3]
> 	  mov     r3, r1
> 	  vldrw.32	q1, [r3]
> 	  mov     r3, r2
> 	  vcadd.i32       q3, q2, q1, #90
> 	  adds    r0, r0, #16
> 	  vstrw.32	q3, [r3]
> 	  adds    r1, r1, #16
> 	  adds    r2, r2, #16
> 	  le      lr, .L3
> 	  pop     {r4, r5, r6, r7, r8, pc}
> 
> which is not ideal due to register allocation and addressing mode issues with
> MVE in general.  However -frename-register cleans up the register allocation:
> 
>   .L3:
> 	  mov     r5, r0
> 	  mov     r6, r1
> 	  vldrw.32	q2, [r5]
> 	  vldrw.32	q1, [r6]
> 	  mov     r7, r2
> 	  vcadd.i32       q3, q2, q1, #90
> 	  adds    r0, r0, #16
> 	  vstrw.32	q3, [r7]
> 	  adds    r1, r1, #16
> 	  adds    r2, r2, #16
> 	  le      lr, .L3
> 	  pop     {r4, r5, r6, r7, r8, pc}
> 
> but leaves the addressing mode problems.
> 
> Before this patch it generated a scalar loop
> 
>   .L2:
> 	  ldr     r7, [r0, r3, lsl #2]
> 	  ldr     r5, [r6, r3, lsl #2]
> 	  ldr     r4, [r1, r3, lsl #2]
> 	  subs    r5, r7, r5
> 	  ldr     r7, [lr, r3, lsl #2]
> 	  add     r4, r4, r7
> 	  str     r5, [r2, r3, lsl #2]
> 	  str     r4, [ip, r3, lsl #2]
> 	  adds    r3, r3, #2
> 	  cmp     r3, #200
> 	  bne     .L2
> 	  pop     {r4, r5, r6, r7, pc}
> 
> 
> 
> Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
> Cross compiled arm-none-eabi and ran with -march=armv8.1-
> m.main+mve.fp
> -mfloat-abi=hard -mfpu=auto and regression is on-going.
> 
> Unfortunately MVE does not currently implement auto-vectorization of
> floating
> point values.  As such I cannot test this directly.  But since they share 90%
> of the code with NEON these should just work whenever support is added so
> I
> would still like to commit these.

I believe MVE modes are now supported for autovectorisation since 29c650cd899496c4f9bc069d03d0d7ecfb632176
Could you try out the floating-point modes too?

> 
> To support this I had to refactor the MVE bits a bit.  This now uses the same
> unspecs for both NEON and MVE and removes the unneeded different signed
> and
> unsigned unspecs since they both point to the signed instruction.
> 
> I have tried multiple approaches to cleaning this up but I think this is the
> nicest it can get given the slight ISA differences.
> 
> Ok for master if no issues?

Ok.
Thanks,
Kyrill

> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/arm/arm_mve.h (__arm_vcaddq_rot90_u8,
> __arm_vcaddq_rot270_u8,
> 	, __arm_vcaddq_rot90_s8, __arm_vcaddq_rot270_s8,
> 	__arm_vcaddq_rot90_u16, __arm_vcaddq_rot270_u16,
> __arm_vcaddq_rot90_s16,
> 	__arm_vcaddq_rot270_s16, __arm_vcaddq_rot90_u32,
> 	__arm_vcaddq_rot270_u32, __arm_vcaddq_rot90_s32,
> 	__arm_vcaddq_rot270_s32, __arm_vcmulq_rot90_f16,
> 	__arm_vcmulq_rot270_f16, __arm_vcmulq_rot180_f16,
> 	__arm_vcmulq_f16, __arm_vcaddq_rot90_f16,
> __arm_vcaddq_rot270_f16,
> 	__arm_vcmulq_rot90_f32, __arm_vcmulq_rot270_f32,
> 	__arm_vcmulq_rot180_f32, __arm_vcmulq_f32,
> __arm_vcaddq_rot90_f32,
> 	__arm_vcaddq_rot270_f32, __arm_vcmlaq_f16,
> __arm_vcmlaq_rot180_f16,
> 	__arm_vcmlaq_rot270_f16, __arm_vcmlaq_rot90_f16,
> __arm_vcmlaq_f32,
> 	__arm_vcmlaq_rot180_f32, __arm_vcmlaq_rot270_f32,
> 	__arm_vcmlaq_rot90_f32): Update builtin calls.
> 	* config/arm/arm_mve_builtins.def (vcaddq_rot90_u,
> vcaddq_rot270_u,
> 	vcaddq_rot90_s, vcaddq_rot270_s, vcaddq_rot90_f,
> vcaddq_rot270_f,
> 	vcmulq_f, vcmulq_rot90_f, vcmulq_rot180_f, vcmulq_rot270_f,
> 	vcmlaq_f, vcmlaq_rot90_f, vcmlaq_rot180_f, vcmlaq_rot270_f):
> Removed.
> 	(vcaddq_rot90, vcaddq_rot270, vcmulq, vcmulq_rot90,
> vcmulq_rot180,
> 	vcmulq_rot270, vcmlaq, vcmlaq_rot90, vcmlaq_rot180,
> vcmlaq_rot270):
> 	New.
> 	* config/arm/constraints.md (Dz): Include MVE.
> 	* config/arm/iterators.md (mve_rotsplit1, mve_rotsplit2): New.
> 	* config/arm/mve.md (VCADDQ_ROT270_S, VCADDQ_ROT90_S,
> VCADDQ_ROT270_U,
> 	VCADDQ_ROT90_U, VCADDQ_ROT270_F, VCADDQ_ROT90_F,
> VCMULQ_F,
> 	VCMULQ_ROT180_F, VCMULQ_ROT270_F, VCMULQ_ROT90_F,
> VCMLAQ_F,
> 	VCMLAQ_ROT180_F, VCMLAQ_ROT90_F, VCMLAQ_ROT270_F,
> VCADDQ_ROT270_S,
> 	VCADDQ_ROT270, VCADDQ_ROT90): Removed.
> 	(mve_rot, VCMUL): New.
> 	(mve_vcaddq_rot270_<supf><mode,
> mve_vcaddq_rot90_<supf><mode>,
> 	mve_vcaddq_rot270_f<mode>, mve_vcaddq_rot90_f<mode>,
> mve_vcmulq_f<mode,
> 	mve_vcmulq_rot180_f<mode>, mve_vcmulq_rot270_f<mode>,
> 	mve_vcmulq_rot90_f<mode>, mve_vcmlaq_f<mode>,
> mve_vcmlaq_rot180_f<mode>,
> 	mve_vcmlaq_rot270_f<mode>, mve_vcmlaq_rot90_f<mode>):
> Removed.
> 	(mve_vcmlaq<mve_rot><mode>, mve_vcmulq<mve_rot><mode>,
> 	mve_vcaddq<mve_rot><mode>, cadd<rot><mode>3,
> mve_vcaddq<mve_rot><mode>):
> 	New.
> 	* config/arm/neon.md (cadd<rot><mode>3,
> cml<fcmac1><rot_op><mode>4):
> 	Moved.
> 	(cmul<rot_op><mode>3): Exclude MVE types.
> 	* config/arm/unspecs.md (UNSPEC_VCMUL90, UNSPEC_VCMUL270):
> New.
> 	* config/arm/vec-common.md (cadd<rot><mode>3,
> cmul<rot_op><mode>3,
> 	arm_vcmla<rot><mode>, cml<fcmac1><rot_op><mode>4): New.
> 
> --
diff mbox series

Patch

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index a801705ced582105df60ccdc79a7500b320e12d4..bd379a5d915ad1d682f2d92554f4bd03c2762733 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -3983,14 +3983,14 @@  __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_uv16qi (__a, __b);
+  return __builtin_mve_vcaddq_rot90v16qi (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_uv16qi (__a, __b);
+  return __builtin_mve_vcaddq_rot270v16qi (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
@@ -4522,14 +4522,14 @@  __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_sv16qi (__a, __b);
+  return __builtin_mve_vcaddq_rot90v16qi (__a, __b);
 }
 
 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_sv16qi (__a, __b);
+  return __builtin_mve_vcaddq_rot270v16qi (__a, __b);
 }
 
 __extension__ extern __inline int8x16_t
@@ -4823,14 +4823,14 @@  __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_uv8hi (__a, __b);
+  return __builtin_mve_vcaddq_rot90v8hi (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_uv8hi (__a, __b);
+  return __builtin_mve_vcaddq_rot270v8hi (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
@@ -5362,14 +5362,14 @@  __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_sv8hi (__a, __b);
+  return __builtin_mve_vcaddq_rot90v8hi (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_sv8hi (__a, __b);
+  return __builtin_mve_vcaddq_rot270v8hi (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
@@ -5663,14 +5663,14 @@  __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_uv4si (__a, __b);
+  return __builtin_mve_vcaddq_rot90v4si (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_uv4si (__a, __b);
+  return __builtin_mve_vcaddq_rot270v4si (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
@@ -6202,14 +6202,14 @@  __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_sv4si (__a, __b);
+  return __builtin_mve_vcaddq_rot90v4si (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_sv4si (__a, __b);
+  return __builtin_mve_vcaddq_rot270v4si (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
@@ -17380,42 +17380,42 @@  __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot90_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcmulq_rot90_fv8hf (__a, __b);
+  return __builtin_mve_vcmulq_rot90v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot270_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcmulq_rot270_fv8hf (__a, __b);
+  return __builtin_mve_vcmulq_rot270v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot180_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcmulq_rot180_fv8hf (__a, __b);
+  return __builtin_mve_vcmulq_rot180v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcmulq_fv8hf (__a, __b);
+  return __builtin_mve_vcmulqv8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_fv8hf (__a, __b);
+  return __builtin_mve_vcaddq_rot90v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_fv8hf (__a, __b);
+  return __builtin_mve_vcaddq_rot270v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
@@ -17632,42 +17632,42 @@  __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot90_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcmulq_rot90_fv4sf (__a, __b);
+  return __builtin_mve_vcmulq_rot90v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot270_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcmulq_rot270_fv4sf (__a, __b);
+  return __builtin_mve_vcmulq_rot270v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot180_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcmulq_rot180_fv4sf (__a, __b);
+  return __builtin_mve_vcmulq_rot180v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcmulq_fv4sf (__a, __b);
+  return __builtin_mve_vcmulqv4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_fv4sf (__a, __b);
+  return __builtin_mve_vcaddq_rot90v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_fv4sf (__a, __b);
+  return __builtin_mve_vcaddq_rot270v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
@@ -17822,28 +17822,28 @@  __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-  return __builtin_mve_vcmlaq_fv8hf (__a, __b, __c);
+  return __builtin_mve_vcmlaqv8hf (__a, __b, __c);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot180_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-  return __builtin_mve_vcmlaq_rot180_fv8hf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot180v8hf (__a, __b, __c);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot270_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-  return __builtin_mve_vcmlaq_rot270_fv8hf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot270v8hf (__a, __b, __c);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot90_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-  return __builtin_mve_vcmlaq_rot90_fv8hf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot90v8hf (__a, __b, __c);
 }
 
 __extension__ extern __inline float16x8_t
@@ -18130,28 +18130,28 @@  __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
-  return __builtin_mve_vcmlaq_fv4sf (__a, __b, __c);
+  return __builtin_mve_vcmlaqv4sf (__a, __b, __c);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot180_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
-  return __builtin_mve_vcmlaq_rot180_fv4sf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot180v4sf (__a, __b, __c);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot270_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
-  return __builtin_mve_vcmlaq_rot270_fv4sf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot270v4sf (__a, __b, __c);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot90_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
-  return __builtin_mve_vcmlaq_rot90_fv4sf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot90v4sf (__a, __b, __c);
 }
 
 __extension__ extern __inline float32x4_t
diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def
index 753e40a951d071c1ab77476a1cc4779e91689178..f31248595b7bef4eed4963dbbbc72371b15b8af8 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -125,8 +125,6 @@  VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcaddq_rot90_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcaddq_rot270_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si)
@@ -202,8 +200,6 @@  VAR3 (BINOP_NONE_NONE_NONE, vhcaddq_rot270_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhaddq_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhaddq_n_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, veorq_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot90_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot270_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vbrsrq_n_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vbicq_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vandq_s, v16qi, v8hi, v4si)
@@ -264,12 +260,6 @@  VAR2 (BINOP_NONE_NONE_NONE, vmaxnmq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vmaxnmavq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vmaxnmaq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, veorq_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot90_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot270_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vbicq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vandq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vaddq_n_f, v8hf, v4sf)
@@ -472,10 +462,6 @@  VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmsq_f, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmasq_n_f, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmaq_n_f, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmaq_f, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90_f, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270_f, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180_f, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_f, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_IMM, vshrntq_n_s, v8hi, v4si)
 VAR2 (TERNOP_NONE_NONE_NONE_IMM, vshrnbq_n_s, v8hi, v4si)
 VAR2 (TERNOP_NONE_NONE_NONE_IMM, vrshrntq_n_s, v8hi, v4si)
@@ -904,3 +890,15 @@  VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_vec_s, v16qi, v8hi, v4si)
 VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_carry_s, v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_vec_u, v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_carry_u, v16qi, v8hi, v4si)
+
+/* optabs without any suffixes.  */
+VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot90, v16qi, v8hi, v4si, v8hf, v4sf)
+VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot270, v16qi, v8hi, v4si, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq, v8hf, v4sf)
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index ff229aa98455e05470801d7110b1aaf5ab3e0d25..e166c11a4c78e8dbcdd25d5cfda14cc36daad5b2 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -310,7 +310,7 @@  (define_constraint "Dz"
  "@internal
   In ARM/Thumb-2 state a vector of constant zeros."
  (and (match_code "const_vector")
-      (match_test "TARGET_NEON && op == CONST0_RTX (mode)")))
+      (match_test "(TARGET_NEON || TARGET_HAVE_MVE) && op == CONST0_RTX (mode)")))
 
 (define_constraint "Da"
  "@internal
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index a4da379670ce3428253664d44d2e18415a8f49ab..63d4ebe786c5f1262851e462942127adc4a5e92c 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1168,6 +1168,21 @@  (define_int_attr rotsplit2 [(UNSPEC_VCMLA "90")
 			    (UNSPEC_VCMLS "180")
 			    (UNSPEC_VCMLS180 "180")])
 
+(define_int_attr mve_rotsplit1 [(UNSPEC_VCMLA "")
+				(UNSPEC_VCMLA180 "")
+				(UNSPEC_VCMUL "")
+				(UNSPEC_VCMUL180 "")
+				(UNSPEC_VCMLS "_rot270")
+				(UNSPEC_VCMLS180 "_rot90")])
+
+(define_int_attr mve_rotsplit2 [(UNSPEC_VCMLA "_rot90")
+				(UNSPEC_VCMLA180 "_rot270")
+				(UNSPEC_VCMUL "_rot90")
+				(UNSPEC_VCMUL180 "_rot270")
+				(UNSPEC_VCMLS "_rot180")
+				(UNSPEC_VCMLS180 "_rot180")])
+
+
 (define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA180 "a")
 			 (UNSPEC_VCMLS "s") (UNSPEC_VCMLS180 "s")])
 
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 465b39a51b3a258295ed764f0e742932e5d59225..b8cd74176a41572008d86e6a074a626ccf35a69c 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -42,7 +42,7 @@  (define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F
 			 VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U VADDLVQ_P_S
 			 VADDLVQ_P_U VCMPNEQ_U VCMPNEQ_S VSHLQ_S VSHLQ_U VABDQ_S
 			 VADDQ_N_S VADDVAQ_S VADDVQ_P_S VANDQ_S VBICQ_S
-			 VBRSRQ_N_S VCADDQ_ROT270_S VCADDQ_ROT90_S VCMPEQQ_S
+			 VBRSRQ_N_S VCMPEQQ_S
 			 VCMPEQQ_N_S VCMPNEQ_N_S VEORQ_S VHADDQ_S VHADDQ_N_S
 			 VHSUBQ_S VHSUBQ_N_S VMAXQ_S VMAXVQ_S VMINQ_S VMINVQ_S
 			 VMLADAVQ_S VMULHQ_S VMULLBQ_INT_S VMULLTQ_INT_S VMULQ_S
@@ -51,7 +51,7 @@  (define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F
 			 VQSUBQ_N_S VRHADDQ_S VRMULHQ_S VRSHLQ_S VRSHLQ_N_S
 			 VRSHRQ_N_S VSHLQ_N_S VSHLQ_R_S VSUBQ_S VSUBQ_N_S
 			 VABDQ_U VADDQ_N_U VADDVAQ_U VADDVQ_P_U VANDQ_U VBICQ_U
-			 VBRSRQ_N_U VCADDQ_ROT270_U VCADDQ_ROT90_U VCMPEQQ_U
+			 VBRSRQ_N_U VCMPEQQ_U
 			 VCMPEQQ_N_U VCMPNEQ_N_U VEORQ_U VHADDQ_U VHADDQ_N_U
 			 VHSUBQ_U VHSUBQ_N_U VMAXQ_U VMAXVQ_U VMINQ_U VMINVQ_U
 			 VMLADAVQ_U VMULHQ_U VMULLBQ_INT_U VMULLTQ_INT_U VMULQ_U
@@ -66,10 +66,9 @@  (define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F
 			 VQDMULHQ_S VQRDMULHQ_N_S VQRDMULHQ_S VQSHLUQ_N_S
 			 VCMPCSQ_N_U VCMPCSQ_U VCMPHIQ_N_U VCMPHIQ_U VABDQ_M_S
 			 VABDQ_M_U VABDQ_F VADDQ_N_F VANDQ_F VBICQ_F
-			 VCADDQ_ROT270_F VCADDQ_ROT90_F VCMPEQQ_F VCMPEQQ_N_F
+			 VCMPEQQ_F VCMPEQQ_N_F
 			 VCMPGEQ_F VCMPGEQ_N_F VCMPGTQ_F VCMPGTQ_N_F VCMPLEQ_F
 			 VCMPLEQ_N_F VCMPLTQ_F VCMPLTQ_N_F VCMPNEQ_F VCMPNEQ_N_F
-			 VCMULQ_F VCMULQ_ROT180_F VCMULQ_ROT270_F VCMULQ_ROT90_F
 			 VEORQ_F VMAXNMAQ_F VMAXNMAVQ_F VMAXNMQ_F VMAXNMVQ_F
 			 VMINNMAQ_F VMINNMAVQ_F VMINNMQ_F VMINNMVQ_F VMULQ_F
 			 VMULQ_N_F VORNQ_F VORRQ_F VSUBQ_F VADDLVAQ_U
@@ -112,18 +111,18 @@  (define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F
 			 VMLSDAVAXQ_S VMLSDAVAQ_S VMLADAVAXQ_S
 			 VCMPGEQ_M_F VCMPGTQ_M_N_F VMLSLDAVQ_P_S VRMLALDAVHAXQ_S
 			 VMLSLDAVXQ_P_S VFMAQ_F VMLSLDAVAQ_S VQSHRUNBQ_N_S
-			 VQRSHRUNTQ_N_S VCMLAQ_F VMINNMAQ_M_F VFMASQ_N_F
+			 VQRSHRUNTQ_N_S VMINNMAQ_M_F VFMASQ_N_F
 			 VDUPQ_M_N_F VCMPGTQ_M_F VCMPLTQ_M_F VRMLSLDAVHQ_P_S
 			 VQSHRUNTQ_N_S VABSQ_M_F VMAXNMAVQ_P_F VFMAQ_N_F
 			 VRMLSLDAVHXQ_P_S VREV32Q_M_F VRMLSLDAVHAQ_S
 			 VRMLSLDAVHAXQ_S VCMPLTQ_M_N_F VCMPNEQ_M_F VRNDAQ_M_F
 			 VRNDPQ_M_F VADDLVAQ_P_S VQMOVUNBQ_M_S VCMPLEQ_M_F
-			 VCMLAQ_ROT180_F VMLSLDAVAXQ_S VRNDXQ_M_F VFMSQ_F
-			 VMINNMVQ_P_F VMAXNMVQ_P_F VPSELQ_F VCMLAQ_ROT90_F
+			 VMLSLDAVAXQ_S VRNDXQ_M_F VFMSQ_F
+			 VMINNMVQ_P_F VMAXNMVQ_P_F VPSELQ_F
 			 VQMOVUNTQ_M_S VREV64Q_M_F VNEGQ_M_F VRNDMQ_M_F
 			 VCMPLEQ_M_N_F VCMPGEQ_M_N_F VRNDNQ_M_F VMINNMAVQ_P_F
 			 VCMPNEQ_M_N_F VRMLALDAVHQ_P_S VRMLALDAVHXQ_P_S
-			 VCMPEQQ_M_N_F VCMLAQ_ROT270_F VMAXNMAQ_M_F VRNDQ_M_F
+			 VCMPEQQ_M_N_F VMAXNMAQ_M_F VRNDQ_M_F
 			 VMLALDAVQ_P_U VMLALDAVQ_P_S VQMOVNBQ_M_S VQMOVNBQ_M_U
 			 VMOVLTQ_M_U VMOVLTQ_M_S VMOVNBQ_M_U VMOVNBQ_M_S
 			 VRSHRNTQ_N_U VRSHRNTQ_N_S VORRQ_M_N_S VORRQ_M_N_U
@@ -240,9 +239,8 @@  (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
 		       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
 		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VANDQ_S "s")
 		       (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u")
-		       (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s")
-		       (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
-		       (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
+		       (VBRSRQ_N_S "s") (VBRSRQ_N_U "u")
+		       (VCMPEQQ_S "s") (VCMPEQQ_U "u")
 		       (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s")
 		       (VCMPNEQ_N_U "u") (VEORQ_S "s") (VEORQ_U "u")
 		       (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
@@ -421,6 +419,19 @@  (define_mode_attr V_extr_elem [(V16QI "u8") (V8HI "u16") (V4SI "32")
 (define_mode_attr earlyclobber_32 [(V16QI "=w") (V8HI "=w") (V4SI "=&w")
 						(V8HF "=w") (V4SF "=&w")])
 
+(define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
+			  (UNSPEC_VCADD270 "_rot270")
+			  (UNSPEC_VCMLA "")
+			  (UNSPEC_VCMLA90 "_rot90")
+			  (UNSPEC_VCMLA180 "_rot180")
+			  (UNSPEC_VCMLA270 "_rot270")
+			  (UNSPEC_VCMUL "")
+			  (UNSPEC_VCMUL90 "_rot90")
+			  (UNSPEC_VCMUL180 "_rot180")
+			  (UNSPEC_VCMUL270 "_rot270")])
+
+(define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90 UNSPEC_VCMUL180 UNSPEC_VCMUL270])
+
 (define_int_iterator VCVTQ_TO_F [VCVTQ_TO_F_S VCVTQ_TO_F_U])
 (define_int_iterator VMVNQ_N [VMVNQ_N_U VMVNQ_N_S])
 (define_int_iterator VREV64Q [VREV64Q_S VREV64Q_U])
@@ -454,8 +465,6 @@  (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
 (define_int_iterator VANDQ [VANDQ_U VANDQ_S])
 (define_int_iterator VBICQ [VBICQ_S VBICQ_U])
 (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
-(define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U])
-(define_int_iterator VCADDQ_ROT90 [VCADDQ_ROT90_U VCADDQ_ROT90_S])
 (define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S])
 (define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U])
 (define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S])
@@ -1585,34 +1594,28 @@  (define_insn "mve_vbrsrq_n_<supf><mode>"
 ])
 
 ;;
-;; [vcaddq_rot270_s, vcaddq_rot270_u])
+;; [vcaddq, vcaddq_rot90, vcadd_rot180, vcadd_rot270])
 ;;
-(define_insn "mve_vcaddq_rot270_<supf><mode>"
+(define_insn "mve_vcaddq<mve_rot><mode>"
   [
    (set (match_operand:MVE_2 0 "s_register_operand" "<earlyclobber_32>")
 	(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
 		       (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCADDQ_ROT270))
+	 VCADD))
   ]
   "TARGET_HAVE_MVE"
-  "vcadd.i%#<V_sz_elem>	%q0, %q1, %q2, #270"
+  "vcadd.i%#<V_sz_elem>	%q0, %q1, %q2, #<rot>"
   [(set_attr "type" "mve_move")
 ])
 
-;;
-;; [vcaddq_rot90_u, vcaddq_rot90_s])
-;;
-(define_insn "mve_vcaddq_rot90_<supf><mode>"
-  [
-   (set (match_operand:MVE_2 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
-		       (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCADDQ_ROT90))
-  ]
+;; Auto vectorizer pattern for int vcadd
+(define_expand "cadd<rot><mode>3"
+  [(set (match_operand:MVE_2 0 "register_operand")
+	(unspec:MVE_2 [(match_operand:MVE_2 1 "register_operand")
+		       (match_operand:MVE_2 2 "register_operand")]
+	  VCADD))]
   "TARGET_HAVE_MVE"
-  "vcadd.i%#<V_sz_elem>	%q0, %q1, %q2, #90"
-  [(set_attr "type" "mve_move")
-])
+)
 
 ;;
 ;; [vcmpcsq_n_u])
@@ -2665,32 +2668,17 @@  (define_insn "mve_vbicq_n_<supf><mode>"
 ])
 
 ;;
-;; [vcaddq_rot270_f])
+;; [vcaddq, vcaddq_rot90, vcadd_rot180, vcadd_rot270])
 ;;
-(define_insn "mve_vcaddq_rot270_f<mode>"
+(define_insn "mve_vcaddq<mve_rot><mode>"
   [
    (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
 	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
 		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCADDQ_ROT270_F))
+	 VCADD))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcadd.f%#<V_sz_elem>	%q0, %q1, %q2, #270"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcaddq_rot90_f])
-;;
-(define_insn "mve_vcaddq_rot90_f<mode>"
-  [
-   (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCADDQ_ROT90_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcadd.f%#<V_sz_elem>	%q0, %q1, %q2, #90"
+  "vcadd.f%#<V_sz_elem>	%q0, %q1, %q2, #<rot>"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2875,62 +2863,17 @@  (define_insn "mve_vcmpneq_n_f<mode>"
 ])
 
 ;;
-;; [vcmulq_f])
-;;
-(define_insn "mve_vcmulq_f<mode>"
-  [
-   (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMULQ_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmul.f%#<V_sz_elem>	%q0, %q1, %q2, #0"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmulq_rot180_f])
-;;
-(define_insn "mve_vcmulq_rot180_f<mode>"
-  [
-   (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMULQ_ROT180_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmul.f%#<V_sz_elem>	%q0, %q1, %q2, #180"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmulq_rot270_f])
-;;
-(define_insn "mve_vcmulq_rot270_f<mode>"
-  [
-   (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMULQ_ROT270_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmul.f%#<V_sz_elem>	%q0, %q1, %q2, #270"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmulq_rot90_f])
+;; [vcmulq, vcmulq_rot90, vcmulq_rot180, vcmulq_rot270])
 ;;
-(define_insn "mve_vcmulq_rot90_f<mode>"
+(define_insn "mve_vcmulq<mve_rot><mode>"
   [
    (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
 	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
 		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMULQ_ROT90_F))
+	 VCMUL))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmul.f%#<V_sz_elem>	%q0, %q1, %q2, #90"
+  "vcmul.f%#<V_sz_elem>	%q0, %q1, %q2, #<rot>"
   [(set_attr "type" "mve_move")
 ])
 
@@ -4692,66 +4635,20 @@  (define_insn "mve_vaddlvaq_p_<supf>v4si"
   [(set_attr "type" "mve_move")
    (set_attr "length""8")])
 ;;
-;; [vcmlaq_f])
-;;
-(define_insn "mve_vcmlaq_f<mode>"
-  [
-   (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")
-		       (match_operand:MVE_0 3 "s_register_operand" "w")]
-	 VCMLAQ_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmla.f%#<V_sz_elem>	%q0, %q2, %q3, #0"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmlaq_rot180_f])
-;;
-(define_insn "mve_vcmlaq_rot180_f<mode>"
-  [
-   (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")
-		       (match_operand:MVE_0 3 "s_register_operand" "w")]
-	 VCMLAQ_ROT180_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmla.f%#<V_sz_elem>	%q0, %q2, %q3, #180"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmlaq_rot270_f])
-;;
-(define_insn "mve_vcmlaq_rot270_f<mode>"
-  [
-   (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")
-		       (match_operand:MVE_0 3 "s_register_operand" "w")]
-	 VCMLAQ_ROT270_F))
-  ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmla.f%#<V_sz_elem>	%q0, %q2, %q3, #270"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmlaq_rot90_f])
+;; [vcmlaq, vcmlaq_rot90, vcmlaq_rot180, vcmlaq_rot270])
 ;;
-(define_insn "mve_vcmlaq_rot90_f<mode>"
+(define_insn "mve_vcmlaq<mve_rot><mode>"
   [
-   (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")
-		       (match_operand:MVE_0 3 "s_register_operand" "w")]
-	 VCMLAQ_ROT90_F))
+   (set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
+	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz")
+		       (match_operand:MVE_0 2 "s_register_operand" "w,w")
+		       (match_operand:MVE_0 3 "s_register_operand" "w,w")]
+	 VCMLA))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmla.f%#<V_sz_elem>	%q0, %q2, %q3, #90"
+  "@
+   vcmla.f%#<V_sz_elem>	%q0, %q2, %q3, #<rot>
+   vcmul.f%#<V_sz_elem>	%q0, %q2, %q3, #<rot>"
   [(set_attr "type" "mve_move")
 ])
 
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 2ccbf99883ec6a4808f16453e3d07454f3e3077e..d87ae8adbf1292bebbb9046e67038fcaa5f23040 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3174,14 +3174,6 @@  (define_insn "neon_vcadd<rot><mode>"
   [(set_attr "type" "neon_fcadd")]
 )
 
-(define_expand "cadd<rot><mode>3"
-  [(set (match_operand:VF 0 "register_operand")
-	(unspec:VF [(match_operand:VF 1 "register_operand")
-		    (match_operand:VF 2 "register_operand")]
-		    VCADD))]
-  "TARGET_COMPLEX"
-)
-
 (define_insn "neon_vcmla<rot><mode>"
   [(set (match_operand:VF 0 "register_operand" "=w")
 	(plus:VF (match_operand:VF 1 "register_operand" "0")
@@ -3238,32 +3230,13 @@  (define_insn "neon_vcmlaq_lane<rot><mode>"
   [(set_attr "type" "neon_fcmla")]
 )
 
-
-;; The complex mla/mls operations always need to expand to two instructions.
-;; The first operation does half the computation and the second does the
-;; remainder.  Because of this, expand early.
-(define_expand "cml<fcmac1><rot_op><mode>4"
-  [(set (match_operand:VF 0 "register_operand")
-	(plus:VF (match_operand:VF 1 "register_operand")
-		 (unspec:VF [(match_operand:VF 2 "register_operand")
-			     (match_operand:VF 3 "register_operand")]
-			     VCMLA_OP)))]
-  "TARGET_COMPLEX"
-{
-  emit_insn (gen_neon_vcmla<rotsplit1><mode> (operands[0], operands[1],
-					      operands[2], operands[3]));
-  emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], operands[0],
-					      operands[2], operands[3]));
-  DONE;
-})
-
 ;; The complex mul operations always need to expand to two instructions.
 ;; The first operation does half the computation and the second does the
 ;; remainder.  Because of this, expand early.
 (define_expand "cmul<rot_op><mode>3"
-  [(set (match_operand:VF 0 "register_operand")
-	(unspec:VF [(match_operand:VF 1 "register_operand")
-		    (match_operand:VF 2 "register_operand")]
+  [(set (match_operand:VDF 0 "register_operand")
+	(unspec:VDF [(match_operand:VDF 1 "register_operand")
+		     (match_operand:VDF 2 "register_operand")]
 		    VCMUL_OP))]
   "TARGET_COMPLEX"
 {
@@ -3276,6 +3249,7 @@  (define_expand "cmul<rot_op><mode>3"
   DONE;
 })
 
+
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "neon_<sup>dot<vsi2qi>"
   [(set (match_operand:VCVTI 0 "register_operand" "=w")
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index d1b2824a0fe76f62d69c18dcec2f47dfb75b586e..1251aace01b42d393a40467906c610c45de0412a 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -511,7 +511,9 @@  (define_c_enum "unspec" [
   UNSPEC_VCMLA180
   UNSPEC_VCMLA270
   UNSPEC_VCMUL
+  UNSPEC_VCMUL90
   UNSPEC_VCMUL180
+  UNSPEC_VCMUL270
   UNSPEC_VCMLS
   UNSPEC_VCMLS180
   UNSPEC_MATMUL_S
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index c3c86c46355e6ace6c90e189b4160dbe4cd9caf3..8affffb68e6928bc4210656134677ad5f2915426 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -191,3 +191,70 @@  (define_expand "vec_set<mode>"
 					       GEN_INT (elem), operands[0]));
   DONE;
 })
+
+(define_expand "cadd<rot><mode>3"
+  [(set (match_operand:VF 0 "register_operand")
+	(unspec:VF [(match_operand:VF 1 "register_operand")
+		    (match_operand:VF 2 "register_operand")]
+		   VCADD))]
+  "TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+		      && ARM_HAVE_NEON_<MODE>_ARITH)"
+)
+
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+  [(set (match_operand:VQ_HSF 0 "register_operand")
+        (unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
+			(match_operand:VQ_HSF 2 "register_operand")]
+		       VCMUL_OP))]
+  "TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT)"
+{
+  if (TARGET_COMPLEX)
+    {
+      rtx tmp = gen_reg_rtx (<MODE>mode);
+      emit_move_insn (tmp, CONST0_RTX (<MODE>mode));
+      emit_insn (gen_neon_vcmla<rotsplit1><mode> (operands[0], tmp,
+                                                  operands[1], operands[2]));
+      emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], operands[0],
+                                                  operands[1], operands[2]));
+    }
+  else
+    {
+      emit_insn (gen_mve_vcmulq<mve_rotsplit1><mode> (operands[0], operands[1],
+                                                      operands[2]));
+      emit_insn (gen_mve_vcmulq<mve_rotsplit2><mode> (operands[0], operands[1],
+                                                      operands[2]));
+    }
+  DONE;
+})
+
+(define_expand "arm_vcmla<rot><mode>"
+  [(set (match_operand:VF 0 "register_operand")
+	(plus:VF (match_operand:VF 1 "register_operand")
+		 (unspec:VF [(match_operand:VF 2 "register_operand")
+			     (match_operand:VF 3 "register_operand")]
+			     VCMLA)))]
+  "TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+		      && ARM_HAVE_NEON_<MODE>_ARITH)"
+)
+
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+  [(set (match_operand:VF 0 "register_operand")
+	(plus:VF (match_operand:VF 1 "register_operand")
+		 (unspec:VF [(match_operand:VF 2 "register_operand")
+			     (match_operand:VF 3 "register_operand")]
+			    VCMLA_OP)))]
+  "TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+		      && ARM_HAVE_NEON_<MODE>_ARITH)"
+{
+  emit_insn (gen_arm_vcmla<rotsplit1><mode> (operands[0], operands[1],
+					     operands[2], operands[3]));
+  emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], operands[0],
+					     operands[2], operands[3]));
+  DONE;
+})