AArch64 Make use of FADDP in simple reductions.

Message ID patch-14681-tamar@arm.com
State New
Series AArch64 Make use of FADDP in simple reductions.

Commit Message

Tamar Christina Sept. 1, 2021, 9:59 a.m. UTC
Hi All,

This is a respin of an older patch which never got reviewed upstream by a
maintainer.  It has been updated to fit the current GCC codegen.

This patch adds a pattern to support the (F)ADDP (scalar) instruction.

Before the patch, the C code

typedef float v4sf __attribute__((vector_size (16)));

float
foo1 (v4sf x)
{
  return x[0] + x[1];
}

generated:

foo1:
	dup     s1, v0.s[1]
	fadd    s0, s1, s0
	ret

After patch:
foo1:
	faddp   s0, v0.2s
	ret

The double case is now handled by SLP, but the remaining cases still need help
from combine.  I have kept the integer and floating-point patterns separate
because the integer one only supports V2DI, and sharing it with the
floating-point one would have required defining a few new iterators for just a
single use.

I support the case where both elements are subregs as a separate pattern,
since there is no way to tell reload through constraints alone that the two
registers must be equal.
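
For reference, combine presents the single-subreg case for foo1 above roughly
as follows (a sketch; the pseudo register numbers are invented):

(set (reg:SF 101)
     (plus:SF (vec_select:SF (reg:V4SF 100)
			     (parallel [(const_int 1)]))
	      (subreg:SF (reg:V4SF 100) 0)))

When the vector value is itself only live as a lowpart subreg (as in the
scalar_faddp2.c test, where a V2DI value is viewed as doubles), the vec_select
operand is a subreg as well, which is the case the *_scalar2 variants handle.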

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_faddp_scalar<mode>,
	*aarch64_addp_scalarv2di, *aarch64_faddp_scalar2<mode>,
	*aarch64_addp_scalar2v2di): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/scalar_faddp.c: New test.
	* gcc.target/aarch64/simd/scalar_faddp2.c: New test.
	* gcc.target/aarch64/simd/scalar_addp.c: New test.

Co-authored-by: Tamar Christina <tamar.christina@arm.com>

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6814dae079c9ff40aaa2bb625432bf9eb8906b73..b49f8b79b11cbb1888c503d9a9384424f44bde05 100644


--

Comments

Tamar Christina Sept. 8, 2021, 12:38 p.m. UTC | #1
Slightly updated patch correcting some modes that were giving warnings.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_faddp_scalar<mode>,
	*aarch64_addp_scalarv2di, *aarch64_faddp_scalar2<mode>,
	*aarch64_addp_scalar2v2di): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/scalar_faddp.c: New test.
	* gcc.target/aarch64/simd/scalar_faddp2.c: New test.
	* gcc.target/aarch64/simd/scalar_addp.c: New test.

Co-authored-by: Tamar Christina <tamar.christina@arm.com>

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 371990fbe2cfb72d22f22ed582bb7ebdebb3edc0..408500f74c06b63368136a49087558d40972873e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3411,6 +3411,70 @@ (define_insn "aarch64_faddp<mode>"
   [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
 )
 
+;; For the case where both operands are a subreg we need to use a
+;; match_dup since reload cannot enforce that the registers are
+;; the same with a constraint in this case.
+(define_insn "*aarch64_faddp_scalar2<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+	(plus:<VEL>
+	  (vec_select:<VEL>
+	    (match_operator:VHSDF 1 "subreg_lowpart_operator"
+	      [(match_operand:VHSDF 2 "register_operand" "w")])
+	    (parallel [(match_operand 3 "const_int_operand" "n")]))
+	  (match_dup:<VEL> 2)))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[3])) == 1"
+  "faddp\t%<Vetype>0, %2.2<Vetype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
+(define_insn "*aarch64_faddp_scalar<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+	(plus:<VEL>
+	  (vec_select:<VEL>
+	    (match_operand:VHSDF 1 "register_operand" "w")
+	    (parallel [(match_operand 2 "const_int_operand" "n")]))
+	  (match_operand:<VEL> 3 "register_operand" "1")))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 1
+   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
+   && subreg_lowpart_p (operands[3])"
+  "faddp\t%<Vetype>0, %1.2<Vetype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
+;; For the case where both operands are a subreg we need to use a
+;; match_dup since reload cannot enforce that the registers are
+;; the same with a constraint in this case.
+(define_insn "*aarch64_addp_scalar2v2di"
+  [(set (match_operand:DI 0 "register_operand" "=w")
+	(plus:DI
+	  (vec_select:DI
+	    (match_operator:V2DI 1 "subreg_lowpart_operator"
+	      [(match_operand:V2DI 2 "register_operand" "w")])
+	    (parallel [(match_operand 3 "const_int_operand" "n")]))
+	  (match_dup:DI 2)))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (2, INTVAL (operands[3])) == 1"
+  "addp\t%d0, %2.2d"
+  [(set_attr "type" "neon_reduc_add_long")]
+)
+
+(define_insn "*aarch64_addp_scalarv2di"
+  [(set (match_operand:DI 0 "register_operand" "=w")
+	(plus:DI
+	  (vec_select:DI
+	    (match_operand:V2DI 1 "register_operand" "w")
+	    (parallel [(match_operand 2 "const_int_operand" "n")]))
+	  (match_operand:DI 3 "register_operand" "1")))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (2, INTVAL (operands[2])) == 1
+   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
+   && subreg_lowpart_p (operands[3])"
+  "addp\t%d0, %1.2d"
+  [(set_attr "type" "neon_reduc_add_long")]
+)
+
 (define_insn "aarch64_reduc_plus_internal<mode>"
  [(set (match_operand:VDQV 0 "register_operand" "=w")
        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab904ca6a6392a3a068615f68e6b76c0716344ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
@@ -0,0 +1,11 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+
+long long
+foo (v2di x)
+{
+  return x[1] + x[0];
+}
+/* { dg-final { scan-assembler-times {addp\td[0-9]+, v[0-9]+.2d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c8a05b46d8b4f7a1634bc04cc61426ba7b9ef91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
@@ -0,0 +1,44 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { scan-assembler-times "dup" 4 } } */
+
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+double
+foo (v2df x)
+{
+  return x[1] + x[0];
+}
+/* { dg-final { scan-assembler-times {faddp\td[0-9]+, v[0-9]+.2d} 1 } } */
+
+float
+foo1 (v4sf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\ts[0-9]+, v[0-9]+.2s} 1 } } */
+
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\th[0-9]+, v[0-9]+.2h} 1 } } */
+
+float
+foo3 (v4sf x)
+{
+  return x[2] + x[3];
+}
+
+__fp16
+foo4 (v8hf x)
+{
+  return x[6] + x[7];
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
new file mode 100644
index 0000000000000000000000000000000000000000..b24484da50cd972fe79fca6ecefdc0dbccb16bd5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
@@ -0,0 +1,14 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -w" } */
+
+typedef __m128i __attribute__((__vector_size__(2 * sizeof(long))));
+double a[];
+*b;
+fn1() {
+  __m128i c;
+  *(__m128i *)a = c;
+  *b = a[0] + a[1];
+}
+
+/* { dg-final { scan-assembler-times "faddp" 1 } } */
+

Richard Sandiford Oct. 8, 2021, 4:24 p.m. UTC | #2
Tamar Christina <tamar.christina@arm.com> writes:
> +;; For the case where both operands are a subreg we need to use a
> +;; match_dup since reload cannot enforce that the registers are
> +;; the same with a constraint in this case.
> +(define_insn "*aarch64_faddp_scalar2<mode>"
> +  [(set (match_operand:<VEL> 0 "register_operand" "=w")
> +	(plus:<VEL>
> +	  (vec_select:<VEL>
> +	    (match_operator:<VEL> 1 "subreg_lowpart_operator"
> +	      [(match_operand:VHSDF 2 "register_operand" "w")])
> +	    (parallel [(match_operand 3 "const_int_operand" "n")]))
> +	  (match_dup:<VEL> 2)))]
> +  "TARGET_SIMD
> +   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[3])) == 1"
> +  "faddp\t%<Vetype>0, %2.2<Vetype>"
> +  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
> +)

The difficulty with using match_dup here is that the first
vec_select operand ought to fold to a REG after reload,
rather than stay as a subreg.  From that POV we're forcing
the generation of non-canonical rtl.
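
E.g. (with an invented hard register) the match_dup form keeps something like

  (vec_select:DF (subreg:V2DF (reg:V2DI 32 v0) 0)
		 (parallel [(const_int 1)]))

alive after reload, where the canonical form would be

  (vec_select:DF (reg:V2DF 32 v0)
		 (parallel [(const_int 1)]))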

Also…

> +(define_insn "*aarch64_faddp_scalar<mode>"
> +  [(set (match_operand:<VEL> 0 "register_operand" "=w")
> +	(plus:<VEL>
> +	  (vec_select:<VEL>
> +	    (match_operand:VHSDF 1 "register_operand" "w")
> +	    (parallel [(match_operand 2 "const_int_operand" "n")]))
> +	  (match_operand:<VEL> 3 "register_operand" "1")))]
> +  "TARGET_SIMD
> +   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 1
> +   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
> +   && subreg_lowpart_p (operands[3])"
> +  "faddp\t%<Vetype>0, %1.2<Vetype>"
> +  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
> +)

…matching constraints don't work reliably between two inputs:
the RA doesn't know how to combine two different inputs into
one input in order to make them match.

Have you tried doing this as a define_peephole2 instead?
That fits this kind of situation better (from an rtl representation
point of view), but peephole2s are admittedly less powerful than combine.

If peephole2s don't work then I think we'll have to provide
a pattern that accepts two distinct inputs and then split the
instruction if the inputs aren't in the same register.  That sounds
a bit ugly though, so it'd be good news if the peephole thing works out.
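
Very roughly the shape I mean (completely untested; the replacement would
need a new insn to emit the scalar faddp, so the UNSPEC_FADDP_SCALAR below
is invented for illustration, and commutativity of the fadd operands would
need handling too):

(define_peephole2
  [(set (match_operand:SF 0 "register_operand")
	(vec_select:SF (match_operand:V4SF 1 "register_operand")
		       (parallel [(const_int 1)])))
   (set (match_operand:SF 2 "register_operand")
	(plus:SF (match_dup 0)
		 (match_operand:SF 3 "register_operand")))]
  "TARGET_SIMD
   && REGNO (operands[1]) == REGNO (operands[3])
   && peep2_reg_dead_p (2, operands[0])"
  [(set (match_dup 2)
	(unspec:SF [(match_dup 1)] UNSPEC_FADDP_SCALAR))]
)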

Thanks,
Richard

Tamar Christina Nov. 1, 2021, 7:39 a.m. UTC | #3
Richard Sandiford <richard.sandiford@arm.com> writes:
> 
> …matching constraints don't work reliably between two inputs:
> the RA doesn't know how to combine two different inputs into one input in
> order to make them match.
> 
> Have you tried doing this as a define_peephole2 instead?
> That fits this kind of situation better (from an rtl representation point of
> view), but peephole2s are admittedly less powerful than combine.
> 
> If peephole2s don't work then I think we'll have to provide a pattern that
> accepts two distinct inputs and then split the instruction if the inputs aren't in
> the same register.  That sounds a bit ugly though, so it'd be good news if the
> peephole thing works out.

Unfortunately peepholes don't work very well here because e.g. addp can be
assigned by the regalloc to the integer side instead of simd, in which case
you can't use the instruction anymore.

The peepholes seem to only detect the simple FP cases.
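
E.g. for the V2DI case the allocated sequence can come out as something like
(illustrative):

foo:
	umov	x1, v0.d[1]
	fmov	x0, d0
	add	x0, x0, x1
	ret

and at that point there's no SIMD-side pairwise add left for a peephole to
match.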

I tried adding something like a post-reload split

+  "&& reload_completed && REGNO (operands[1]) != REGNO (operands[3])"
+  [(clobber (match_scratch:<VEL> 4 "=w"))
+   (set (match_dup 4)
+       (vec_select:<VEL>
+         (match_dup 1)
+         (parallel [(match_dup 2)])))
+   (set (match_dup 0)
+       (plus:<VEL>
+         (match_dup 4)
+         (match_dup 3)))]
+  ""

But this doesn't seem like it'll work as it needs a scratch?

Thanks,
Tamar

Richard Sandiford Nov. 1, 2021, 8:49 a.m. UTC | #4
Tamar Christina <Tamar.Christina@arm.com> writes:
>> 
>> > +(define_insn "*aarch64_faddp_scalar<mode>"
>> > +  [(set (match_operand:<VEL> 0 "register_operand" "=w")
>> > +	(plus:<VEL>
>> > +	  (vec_select:<VEL>
>> > +	    (match_operand:VHSDF 1 "register_operand" "w")
>> > +	    (parallel [(match_operand 2 "const_int_operand" "n")]))
>> > +	  (match_operand:<VEL> 3 "register_operand" "1")))]
>> > +  "TARGET_SIMD
>> > +   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 1
>> > +   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
>> > +   && subreg_lowpart_p (operands[3])"
>> > +  "faddp\t%<Vetype>0, %1.2<Vetype>"
>> > +  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
>> > +)

Also:

It'd probably be better to use V2F for the iterator, since it excludes
V4 and V8 modes.

I think we can use vect_par_cnst_hi_half for operand 2.

> Unfortunately peepholes don't work very well here because e.g. addp can be
> assigned by the regalloc to the integer side instead of simd, in which case
> you can't use the instruction anymore.

Ah, yeah, we wouldn't be able to recover easily from that.

> The peepholes seem to only detect the simple FP cases.
>
> I tried adding something like a post-reload split
>
> +  "&& reload_completed && REGNO (operands[1]) != REGNO (operands[3])"
> +  [(clobber (match_scratch:<VEL> 4 "=w"))
> +   (set (match_dup 4)
> +       (vec_select:<VEL>
> +         (match_dup 1)
> +         (parallel [(match_dup 2)])))
> +   (set (match_dup 0)
> +       (plus:<VEL>
> +         (match_dup 4)
> +         (match_dup 3)))]
> +  ""
>
> But this doesn't seem like it'll work as it needs a scratch?

The clobber should go in the pattern itself, rather than in the split.  E.g.:

  [(set (match_operand:<VEL> 0 "register_operand" "=w")
	(plus:<VEL>
	  (vec_select:<VEL>
	    (match_operand:V2F 1 "register_operand" "w")
	    (match_operand 2 "vect_par_cnst_hi_half"))
	  (match_operand:<VEL> 3 "register_operand" "w")))
   (clobber (match_scratch:V2F 4 "=&w"))]

recog will add the clobber when needed.

However, it's probably simpler to make operand 0 itself earlyclobber:

  [(set (match_operand:<VEL> 0 "register_operand" "=&w")
	(plus:<VEL>
	  (vec_select:<VEL>
	    (match_operand:V2F 1 "register_operand" "w")
	    (match_operand 2 "vect_par_cnst_hi_half"))
	  (match_operand:<VEL> 3 "register_operand" "w")))]

The output code should return "#" if the register numbers are different.
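
I.e. something along these lines (a sketch only, untested):

(define_insn_and_split "*aarch64_faddp_scalar<mode>"
  [(set (match_operand:<VEL> 0 "register_operand" "=&w")
	(plus:<VEL>
	  (vec_select:<VEL>
	    (match_operand:V2F 1 "register_operand" "w")
	    (match_operand 2 "vect_par_cnst_hi_half"))
	  (match_operand:<VEL> 3 "register_operand" "w")))]
  "TARGET_SIMD"
{
  if (REGNO (operands[1]) == REGNO (operands[3]))
    return "faddp\t%<Vetype>0, %1.2<Vetype>";
  return "#";
}
  "&& reload_completed && REGNO (operands[1]) != REGNO (operands[3])"
  [(set (match_dup 0)
	(vec_select:<VEL> (match_dup 1) (match_dup 2)))
   (set (match_dup 0)
	(plus:<VEL> (match_dup 0) (match_dup 3)))]
  ""
  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
)

with the earlyclobber guaranteeing that the split's first set doesn't
clobber operand 3.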

All of this complication makes me think: couldn't we match this in
gimple instead?  I.e. check for an addition of the elements in a
2-element vector and match it to IFN_REDUC_PLUS (when supported)?
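
Rough shape of such a match.pd rule (illustrative only; the check that the
second BIT_FIELD_REF selects the remaining element is elided):

/* x[0] + x[1] on a two-element vector -> .REDUC_PLUS (x) when the
   target implements the reduction directly.  */
(simplify
 (plus:c (BIT_FIELD_REF @0 @1 integer_zerop)
	 (BIT_FIELD_REF @0 @1 @2))
 (if (VECTOR_TYPE_P (TREE_TYPE (@0))
      && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)), 2U)
      && direct_internal_fn_supported_p (IFN_REDUC_PLUS, TREE_TYPE (@0),
					 OPTIMIZE_FOR_BOTH))
  (IFN_REDUC_PLUS @0)))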

Thanks,
Richard

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6814dae079c9ff40aaa2bb625432bf9eb8906b73..b49f8b79b11cbb1888c503d9a9384424f44bde05 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3414,6 +3414,70 @@  (define_insn "aarch64_faddp<mode>"
   [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
 )
 
+;; For the case where both operands are a subreg we need to use a
+;; match_dup since reload cannot enforce that the registers are
+;; the same with a constraint in this case.
+(define_insn "*aarch64_faddp_scalar2<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+	(plus:<VEL>
+	  (vec_select:<VEL>
+	    (match_operator:<VEL> 1 "subreg_lowpart_operator"
+	      [(match_operand:VHSDF 2 "register_operand" "w")])
+	    (parallel [(match_operand 3 "const_int_operand" "n")]))
+	  (match_dup:<VEL> 2)))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[3])) == 1"
+  "faddp\t%<Vetype>0, %2.2<Vetype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
+(define_insn "*aarch64_faddp_scalar<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+	(plus:<VEL>
+	  (vec_select:<VEL>
+	    (match_operand:VHSDF 1 "register_operand" "w")
+	    (parallel [(match_operand 2 "const_int_operand" "n")]))
+	  (match_operand:<VEL> 3 "register_operand" "1")))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 1
+   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
+   && subreg_lowpart_p (operands[3])"
+  "faddp\t%<Vetype>0, %1.2<Vetype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
+;; For the case where both operands are a subreg we need to use a
+;; match_dup since reload cannot enforce that the registers are
+;; the same with a constraint in this case.
+(define_insn "*aarch64_addp_scalar2v2di"
+  [(set (match_operand:DI 0 "register_operand" "=w")
+	(plus:DI
+	  (vec_select:DI
+	    (match_operator:DI 1 "subreg_lowpart_operator"
+	      [(match_operand:V2DI 2 "register_operand" "w")])
+	    (parallel [(match_operand 3 "const_int_operand" "n")]))
+	  (match_dup:DI 2)))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (2, INTVAL (operands[3])) == 1"
+  "addp\t%d0, %2.2d"
+  [(set_attr "type" "neon_reduc_add_long")]
+)
+
+(define_insn "*aarch64_addp_scalarv2di"
+  [(set (match_operand:DI 0 "register_operand" "=w")
+	(plus:DI
+	  (vec_select:DI
+	    (match_operand:V2DI 1 "register_operand" "w")
+	    (parallel [(match_operand 2 "const_int_operand" "n")]))
+	  (match_operand:DI 3 "register_operand" "1")))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (2, INTVAL (operands[2])) == 1
+   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
+   && subreg_lowpart_p (operands[3])"
+  "addp\t%d0, %1.2d"
+  [(set_attr "type" "neon_reduc_add_long")]
+)
+
 (define_insn "aarch64_reduc_plus_internal<mode>"
  [(set (match_operand:VDQV 0 "register_operand" "=w")
        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab904ca6a6392a3a068615f68e6b76c0716344ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
@@ -0,0 +1,11 @@ 
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+
+long long
+foo (v2di x)
+{
+  return x[1] + x[0];
+}
+/* { dg-final { scan-assembler-times {addp\td[0-9]+, v[0-9]+.2d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c8a05b46d8b4f7a1634bc04cc61426ba7b9ef91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
@@ -0,0 +1,44 @@ 
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { scan-assembler-times "dup" 4 } } */
+
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+double
+foo (v2df x)
+{
+  return x[1] + x[0];
+}
+/* { dg-final { scan-assembler-times {faddp\td[0-9]+, v[0-9]+.2d} 1 } } */
+
+float
+foo1 (v4sf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\ts[0-9]+, v[0-9]+.2s} 1 } } */
+
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\th[0-9]+, v[0-9]+.2h} 1 } } */
+
+float
+foo3 (v4sf x)
+{
+  return x[2] + x[3];
+}
+
+__fp16
+foo4 (v8hf x)
+{
+  return x[6] + x[7];
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
new file mode 100644
index 0000000000000000000000000000000000000000..b24484da50cd972fe79fca6ecefdc0dbccb16bd5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
@@ -0,0 +1,14 @@ 
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -w" } */
+
+typedef __m128i __attribute__((__vector_size__(2 * sizeof(long))));
+double a[];
+*b;
+fn1() {
+  __m128i c;
+  *(__m128i *)a = c;
+  *b = a[0] + a[1];
+}
+
+/* { dg-final { scan-assembler-times "faddp" 1 } } */
+