diff mbox series

[v2,11/16] AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA.

Message ID 20200925143024.GA25584@arm.com
State New
Headers show
Series middle-end Add support for SLP vectorization of complex number instructions. | expand

Commit Message

Tamar Christina Sept. 25, 2020, 2:30 p.m. UTC
Hi All,

This adds an implementation of the optabs for complex operations.  With this, the
following C code:

  void f90 (float complex a[restrict N], float complex b[restrict N],
	    float complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] = a[i] + (b[i] * I);
  }

generates

  f90:
	  mov     x3, 0
	  mov     x4, 400
	  ptrue   p1.b, all
	  whilelo p0.s, xzr, x4
	  .p2align 3,,7
  .L2:
	  ld1w    z0.s, p0/z, [x0, x3, lsl 2]
	  ld1w    z1.s, p0/z, [x1, x3, lsl 2]
	  fcadd   z0.s, p1/m, z0.s, z1.s, #90
	  st1w    z0.s, p0, [x2, x3, lsl 2]
	  incw    x3
	  whilelo p0.s, x3, x4
	  b.any   .L2
	  ret

instead of

  f90:
	  mov     x3, 0
	  mov     x4, 0
	  mov     w5, 200
	  whilelo p0.s, wzr, w5
	  .p2align 3,,7
  .L2:
	  ld2w    {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]
	  ld2w    {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]
	  fsub    z0.s, z4.s, z3.s
	  fadd    z1.s, z2.s, z5.s
	  st2w    {z0.s - z1.s}, p0, [x2, x3, lsl 2]
	  incw    x4
	  inch    x3
	  whilelo p0.s, w4, w5
	  b.any   .L2
	  ret

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve.md (cadd<rot><mode>3,
	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
	* config/aarch64/iterators.md (sve_rot1, sve_rot2): New.

--

Comments

Tamar Christina Nov. 14, 2020, 3:12 p.m. UTC | #1
ping

> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:30 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: [PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex
> Addition, Multiply and FMA.
> 
> Hi All,
> 
> This adds implementation for the optabs for complex operations.  With this
> the following C code:
> 
>   void f90 (float complex a[restrict N], float complex b[restrict N],
> 	    float complex c[restrict N])
>   {
>     for (int i=0; i < N; i++)
>       c[i] = a[i] + (b[i] * I);
>   }
> 
> generates
> 
>   f90:
> 	  mov     x3, 0
> 	  mov     x4, 400
> 	  ptrue   p1.b, all
> 	  whilelo p0.s, xzr, x4
> 	  .p2align 3,,7
>   .L2:
> 	  ld1w    z0.s, p0/z, [x0, x3, lsl 2]
> 	  ld1w    z1.s, p0/z, [x1, x3, lsl 2]
> 	  fcadd   z0.s, p1/m, z0.s, z1.s, #90
> 	  st1w    z0.s, p0, [x2, x3, lsl 2]
> 	  incw    x3
> 	  whilelo p0.s, x3, x4
> 	  b.any   .L2
> 	  ret
> 
> instead of
> 
>   f90:
> 	  mov     x3, 0
> 	  mov     x4, 0
> 	  mov     w5, 200
> 	  whilelo p0.s, wzr, w5
> 	  .p2align 3,,7
>   .L2:
> 	  ld2w    {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]
> 	  ld2w    {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]
> 	  fsub    z0.s, z4.s, z3.s
> 	  fadd    z1.s, z2.s, z5.s
> 	  st2w    {z0.s - z1.s}, p0, [x2, x3, lsl 2]
> 	  incw    x4
> 	  inch    x3
> 	  whilelo p0.s, w4, w5
> 	  b.any   .L2
> 	  ret
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-sve.md (cadd<rot><mode>3,
> 	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
> 	* config/aarch64/iterators.md (sve_rot1, sve_rot2): New.
> 
> --
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index cd79aba90ec9cdb5da9e9758495015ef36b2d869..12bc8077994f5a130ff4af6e9bfa7ca1237d0868 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5109,6 +5109,20 @@  (define_expand "@cond_<optab><mode>"
   "TARGET_SVE"
 )
 
+;; Unpredicated optab for the auto-vectorizer: predicated FCADD using a ptrue.
+(define_expand "@cadd<rot><mode>3"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+	(unspec:SVE_FULL_F
+	  [(match_dup 3)
+	   (const_int SVE_RELAXED_GP)
+	   (match_operand:SVE_FULL_F 1 "register_operand")
+	   (match_operand:SVE_FULL_F 2 "register_operand")]
+	  SVE_COND_FCADD))]
+  "TARGET_SVE"
+{
+  operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+})
+
 ;; Predicated FCADD, merging with the first input.
 (define_insn_and_rewrite "*cond_<optab><mode>_2"
   [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w")
@@ -6554,6 +6568,62 @@  (define_insn "@aarch64_pred_<optab><mode>"
   [(set_attr "movprfx" "*,yes")]
 )
 
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+	(unspec:SVE_FULL_F
+	  [(match_dup 4)
+	   (match_dup 5)
+	   (match_operand:SVE_FULL_F 1 "register_operand")
+	   (match_operand:SVE_FULL_F 2 "register_operand")
+	   (match_operand:SVE_FULL_F 3 "register_operand")]
+	  FCMLA_OP))]
+  "TARGET_SVE"
+{
+  operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  operands[5] = gen_int_mode (SVE_RELAXED_GP, SImode);
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot1><mode> (operands[0], operands[4],
+					    operands[1], operands[2],
+					    operands[3], operands[5]));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[4],
+					    operands[0], operands[2],
+					    operands[3], operands[5]));
+  DONE;
+})
+
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+	(unspec:SVE_FULL_F
+	  [(match_dup 3)
+	   (match_dup 4)
+	   (match_operand:SVE_FULL_F 1 "register_operand")
+	   (match_operand:SVE_FULL_F 2 "register_operand")
+	   (match_dup 5)]
+	  FCMUL_OP))]
+  "TARGET_SVE"
+{
+  operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+  operands[4] = gen_int_mode (SVE_RELAXED_GP, SImode);
+  operands[5] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot1><mode> (operands[0], operands[3], operands[1],
+					    operands[2], operands[5], operands[4]));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[3], operands[1],
+					    operands[2], operands[0],
+					    operands[4]));
+  DONE;
+})
+
 ;; Predicated FCMLA with merging.
 (define_expand "@cond_<optab><mode>"
   [(set (match_operand:SVE_FULL_F 0 "register_operand")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 98217c9fd3ee2b6063f7564193e400e9ef71c6ac..7662b929e2c4f6c103cc06e051eb574247320809 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3443,6 +3443,35 @@  (define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
 			    (UNSPEC_FCMLS "180")
 			    (UNSPEC_FCMLS180 "180")])
 
+;; SVE has slightly different naming from NEON, so we have to split these
+;; iterators.
+(define_int_attr sve_rot1 [(UNSPEC_FCMLA "")
+			   (UNSPEC_FCMLA180 "")
+			   (UNSPEC_FCMUL "")
+			   (UNSPEC_FCMUL180 "")
+			   (UNSPEC_FCMLS "270")
+			   (UNSPEC_FCMLS180 "90")
+			   (UNSPEC_CMLA "")
+			   (UNSPEC_CMLA180 "")
+			   (UNSPEC_CMUL "")
+			   (UNSPEC_CMUL180 "")
+			   (UNSPEC_CMLS "270")
+			   (UNSPEC_CMLS180 "90")])
+
+(define_int_attr sve_rot2 [(UNSPEC_FCMLA "90")
+			   (UNSPEC_FCMLA180 "270")
+			   (UNSPEC_FCMUL "90")
+			   (UNSPEC_FCMUL180 "270")
+			   (UNSPEC_FCMLS "180")
+			   (UNSPEC_FCMLS180 "180")
+			   (UNSPEC_CMLA "90")
+			   (UNSPEC_CMLA180 "270")
+			   (UNSPEC_CMUL "90")
+			   (UNSPEC_CMUL180 "270")
+			   (UNSPEC_CMLS "180")
+			   (UNSPEC_CMLS180 "180")])
+
+
 (define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA180 "a")
 			 (UNSPEC_FCMLS "s") (UNSPEC_FCMLS180 "s")
 			 (UNSPEC_CMLA "a") (UNSPEC_CMLA180 "a")