diff mbox series

[2/4] AArch64: Add support for sign differing dot-product usdot for NEON and SVE.

Message ID 20210505173854.GA17884@arm.com
State New
Headers show
Series [1/4] middle-end Vect: Add support for dot-product where the sign for the multiplicant changes. | expand

Commit Message

Tamar Christina May 5, 2021, 5:38 p.m. UTC
Hi All,

This adds optabs implementing usdot_prod.

The following testcase:

#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
    {
      int av = a[i];
      int bv = b[i];
      SIGNEDNESS_2 short mult = av * bv;
      res += mult;
    }
  return res;
}

Generates for NEON

f:
        movi    v0.4s, 0
        mov     x3, 0
        .p2align 3,,7
.L2:
        ldr     q1, [x2, x3]
        ldr     q2, [x1, x3]
        usdot   v0.4s, v1.16b, v2.16b
        add     x3, x3, 16
        cmp     x3, 480
        bne     .L2
        addv    s0, v0.4s
        fmov    w1, s0
        add     w0, w0, w1
        ret

and for SVE

f:
        mov     x3, 0
        cntb    x5
        mov     w4, 480
        mov     z1.b, #0
        whilelo p0.b, wzr, w4
        mov     z3.b, #0
        ptrue   p1.b, all
        .p2align 3,,7
.L2:
        ld1b    z2.b, p0/z, [x1, x3]
        ld1b    z0.b, p0/z, [x2, x3]
        add     x3, x3, x5
        sel     z0.b, p0, z0.b, z3.b
        whilelo p0.b, w3, w4
        usdot   z1.s, z0.b, z2.b
        b.any   .L2
        uaddv   d0, p1, z1.s
        fmov    x1, d0
        add     w0, w0, w1
        ret

instead of

f:
        movi    v0.4s, 0
        mov     x3, 0
        .p2align 3,,7
.L2:
        ldr     q2, [x1, x3]
        ldr     q1, [x2, x3]
        add     x3, x3, 16
        sxtl    v4.8h, v2.8b
        sxtl2   v3.8h, v2.16b
        uxtl    v2.8h, v1.8b
        uxtl2   v1.8h, v1.16b
        mul     v2.8h, v2.8h, v4.8h
        mul     v1.8h, v1.8h, v3.8h
        saddw   v0.4s, v0.4s, v2.4h
        saddw2  v0.4s, v0.4s, v2.8h
        saddw   v0.4s, v0.4s, v1.4h
        saddw2  v0.4s, v0.4s, v1.8h
        cmp     x3, 480
        bne     .L2
        addv    s0, v0.4s
        fmov    w1, s0
        add     w0, w0, w1
        ret

and

f:
        mov     x3, 0
        cnth    x5
        mov     w4, 480
        mov     z1.b, #0
        whilelo p0.h, wzr, w4
        ptrue   p2.b, all
        .p2align 3,,7
.L2:
        ld1sb   z2.h, p0/z, [x1, x3]
        punpklo p1.h, p0.b
        ld1b    z0.h, p0/z, [x2, x3]
        add     x3, x3, x5
        mul     z0.h, p2/m, z0.h, z2.h
        sunpklo z2.s, z0.h
        sunpkhi z0.s, z0.h
        add     z1.s, p1/m, z1.s, z2.s
        punpkhi p1.h, p0.b
        whilelo p0.h, w3, w4
        add     z1.s, p1/m, z1.s, z0.s
        b.any   .L2
        uaddv   d0, p2, z1.s
        fmov    x1, d0
        add     w0, w0, w1
        ret

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): New.
	* config/aarch64/aarch64-sve.md (@aarch64_<sur>dot_prod<vsi2qi>):
	Rename to...
	(@<sur>dot_prod<vsi2qi>): ...This.
	* config/aarch64/aarch64-sve-builtins-base.cc
	(svusdot_impl::expand): Use it.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vusdot-autovec.c: New test.
	* gcc.target/aarch64/sve/vusdot-autovec.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4edee99051c4e2112b546becca47da32aae21df2..c9fb8e702732dd311fb10de17126432e2a63a32b 100644


--

Comments

Richard Sandiford May 10, 2021, 4:49 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 4edee99051c4e2112b546becca47da32aae21df2..c9fb8e702732dd311fb10de17126432e2a63a32b 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -648,6 +648,22 @@ (define_expand "<sur>dot_prod<vsi2qi>"
>    DONE;
>  })
>  
> +;; Auto-vectorizer pattern for usdot
> +(define_expand "usdot_prod<vsi2qi>"
> +  [(set (match_operand:VS 0 "register_operand")
> +	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
> +			    (match_operand:<VSI2QI> 2 "register_operand")]
> +		 UNSPEC_USDOT)
> +		(match_operand:VS 3 "register_operand")))]
> +  "TARGET_I8MM"
> +{
> +  emit_insn (
> +    gen_aarch64_usdot<vsi2qi> (operands[3], operands[3], operands[1],
> +			       operands[2]));
> +  emit_move_insn (operands[0], operands[3]);
> +  DONE;
> +})

We can't modify operands[3] here; it's an input rather than an output.

It looks like this would work with just the {…} removed though.
The pattern will match aarch64_usdot<vsi2qi> on its own accord.

Even better would be to rename __builtin_aarch64_usdot… to
__builtin_usdot_prod…, change its arguments so that they line up
with the optabs, and change arm_neon.h to match.

> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b99a945903c043c7410becaf6f09496dd038410d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */
> +
> +#define N 480
> +#define SIGNEDNESS_1 unsigned
> +#define SIGNEDNESS_2 signed
> +#define SIGNEDNESS_3 signed
> +#define SIGNEDNESS_4 unsigned
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
> +   SIGNEDNESS_4 char *restrict b)
> +{
> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> +    {
> +      int av = a[i];
> +      int bv = b[i];
> +      SIGNEDNESS_2 short mult = av * bv;
> +      res += mult;
> +    }
> +  return res;
> +}
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b,
> +   SIGNEDNESS_4 char *restrict a)
> +{
> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> +    {
> +      int av = a[i];
> +      int bv = b[i];
> +      SIGNEDNESS_2 short mult = av * bv;
> +      res += mult;
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..094dd51cea62e0ba05ec3505657bf05320e5fdbb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8.2-a+i8mm+sve" } */
> +
> +#define N 480
> +#define SIGNEDNESS_1 unsigned
> +#define SIGNEDNESS_2 signed
> +#define SIGNEDNESS_3 signed
> +#define SIGNEDNESS_4 unsigned
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
> +   SIGNEDNESS_4 char *restrict b)
> +{
> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> +    {
> +      int av = a[i];
> +      int bv = b[i];
> +      SIGNEDNESS_2 short mult = av * bv;
> +      res += mult;
> +    }
> +  return res;
> +}
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b,
> +   SIGNEDNESS_4 char *restrict a)
> +{
> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> +    {
> +      int av = a[i];
> +      int bv = b[i];
> +      SIGNEDNESS_2 short mult = av * bv;
> +      res += mult;
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */

Guess this is personal preference, but I don't think the SIGNEDNESS_*
macros add anything when used like this.  I remember doing something
similar in the past when including .c files from other .c files(!)
in order to avoid cut-&-paste, but there doesn't seem much benefit
for standalone files like these.

Thanks,
Richard
Tamar Christina May 25, 2021, 2:57 p.m. UTC | #2
Hi Richard,

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, May 10, 2021 5:49 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 2/4]AArch64: Add support for sign differing dot-product
> usdot for NEON and SVE.
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index
> >
> 4edee99051c4e2112b546becca47da32aae21df2..c9fb8e702732dd311fb10de1
> 7126
> > 432e2a63a32b 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -648,6 +648,22 @@ (define_expand "<sur>dot_prod<vsi2qi>"
> >    DONE;
> >  })
> >
> > +;; Auto-vectorizer pattern for usdot
> > +(define_expand "usdot_prod<vsi2qi>"
> > +  [(set (match_operand:VS 0 "register_operand")
> > +	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1
> "register_operand")
> > +			    (match_operand:<VSI2QI> 2 "register_operand")]
> > +		 UNSPEC_USDOT)
> > +		(match_operand:VS 3 "register_operand")))]
> > +  "TARGET_I8MM"
> > +{
> > +  emit_insn (
> > +    gen_aarch64_usdot<vsi2qi> (operands[3], operands[3], operands[1],
> > +			       operands[2]));
> > +  emit_move_insn (operands[0], operands[3]);
> > +  DONE;
> > +})
> 
> We can't modify operands[3] here; it's an input rather than an output.

Sorry, I should have noticed this.. I had blindly copied the existing pattern for dot-product and that looks like it's wrong.
I'll send a different patch to fix that one.

> 
> It looks like this would work with just the {…} removed though.
> The pattern will match aarch64_usdot<vsi2qi> on its own accord.
> 
> Even better would be to rename __builtin_aarch64_usdot… to
> __builtin_usdot_prod…, change its arguments so that they line up with the
> optabs, and change arm_neon.h to match.
> 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..b99a945903c043c7410becaf6f
> 09
> > 496dd038410d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */
> > +
> > +#define N 480
> > +#define SIGNEDNESS_1 unsigned
> > +#define SIGNEDNESS_2 signed
> > +#define SIGNEDNESS_3 signed
> > +#define SIGNEDNESS_4 unsigned
> > +
> > +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> > +SIGNEDNESS_3 char *restrict a,
> > +   SIGNEDNESS_4 char *restrict b)
> > +{
> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > +    {
> > +      int av = a[i];
> > +      int bv = b[i];
> > +      SIGNEDNESS_2 short mult = av * bv;
> > +      res += mult;
> > +    }
> > +  return res;
> > +}
> > +
> > +SIGNEDNESS_1 int __attribute__ ((noipa)) g (SIGNEDNESS_1 int res,
> > +SIGNEDNESS_3 char *restrict b,
> > +   SIGNEDNESS_4 char *restrict a)
> > +{
> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > +    {
> > +      int av = a[i];
> > +      int bv = b[i];
> > +      SIGNEDNESS_2 short mult = av * bv;
> > +      res += mult;
> > +    }
> > +  return res;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..094dd51cea62e0ba05ec35056
> 57b
> > f05320e5fdbb
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=armv8.2-a+i8mm+sve" } */
> > +
> > +#define N 480
> > +#define SIGNEDNESS_1 unsigned
> > +#define SIGNEDNESS_2 signed
> > +#define SIGNEDNESS_3 signed
> > +#define SIGNEDNESS_4 unsigned
> > +
> > +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> > +SIGNEDNESS_3 char *restrict a,
> > +   SIGNEDNESS_4 char *restrict b)
> > +{
> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > +    {
> > +      int av = a[i];
> > +      int bv = b[i];
> > +      SIGNEDNESS_2 short mult = av * bv;
> > +      res += mult;
> > +    }
> > +  return res;
> > +}
> > +
> > +SIGNEDNESS_1 int __attribute__ ((noipa)) g (SIGNEDNESS_1 int res,
> > +SIGNEDNESS_3 char *restrict b,
> > +   SIGNEDNESS_4 char *restrict a)
> > +{
> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > +    {
> > +      int av = a[i];
> > +      int bv = b[i];
> > +      SIGNEDNESS_2 short mult = av * bv;
> > +      res += mult;
> > +    }
> > +  return res;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */
> 
> Guess this is personal preference, but I don't think the SIGNEDNESS_*
> macros add anything when used like this.  I remember doing something
> similar in the past when including .c files from other .c files(!) in order to
> avoid cut-&-paste, but there doesn't seem much benefit for standalone files
> like these.

If it's the same to you, I do prefer this version, since it's identical to the mid-end tests,
It does allow when familiar with the  tests to just quickly see what it's testing.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (aarch64_usdot<vsi2qi>): Rename to...
	(usdot_prod<vsi2qi>): ... This.
	* config/aarch64/aarch64-simd-builtins.def (usdot): Rename to...
	(usdot_prod): ...This.
	* config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Likewise.
	* config/aarch64/aarch64-sve.md (@aarch64_<sur>dot_prod<vsi2qi>):
	Rename to...
	(@<sur>dot_prod<vsi2qi>): ...This.
	* config/aarch64/aarch64-sve-builtins-base.cc
	(svusdot_impl::expand): Use it.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vusdot-autovec.c: New test.
	* gcc.target/aarch64/sve/vusdot-autovec.c: New test.

> 
> Thanks,
> Richard
Richard Sandiford May 26, 2021, 8:50 a.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Monday, May 10, 2021 5:49 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH 2/4]AArch64: Add support for sign differing dot-product
>> usdot for NEON and SVE.
>>
>> Tamar Christina <tamar.christina@arm.com> writes:
>> > diff --git a/gcc/config/aarch64/aarch64-simd.md
>> > b/gcc/config/aarch64/aarch64-simd.md
>> > index
>> >
>> 4edee99051c4e2112b546becca47da32aae21df2..c9fb8e702732dd311fb10de1
>> 7126
>> > 432e2a63a32b 100644
>> > --- a/gcc/config/aarch64/aarch64-simd.md
>> > +++ b/gcc/config/aarch64/aarch64-simd.md
>> > @@ -648,6 +648,22 @@ (define_expand "<sur>dot_prod<vsi2qi>"
>> >    DONE;
>> >  })
>> >
>> > +;; Auto-vectorizer pattern for usdot
>> > +(define_expand "usdot_prod<vsi2qi>"
>> > +  [(set (match_operand:VS 0 "register_operand")
>> > +   (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1
>> "register_operand")
>> > +                       (match_operand:<VSI2QI> 2 "register_operand")]
>> > +            UNSPEC_USDOT)
>> > +           (match_operand:VS 3 "register_operand")))]
>> > +  "TARGET_I8MM"
>> > +{
>> > +  emit_insn (
>> > +    gen_aarch64_usdot<vsi2qi> (operands[3], operands[3], operands[1],
>> > +                          operands[2]));
>> > +  emit_move_insn (operands[0], operands[3]);
>> > +  DONE;
>> > +})
>>
>> We can't modify operands[3] here; it's an input rather than an output.
>
> Sorry, I should have noticed this.. I had blindly copied the existing pattern for dot-product and that looks like it's wrong.
> I'll send a different patch to fix that one.
>
>>
>> It looks like this would work with just the {…} removed though.
>> The pattern will match aarch64_usdot<vsi2qi> on its own accord.
>>
>> Even better would be to rename __builtin_aarch64_usdot… to
>> __builtin_usdot_prod…, change its arguments so that they line up with the
>> optabs, and change arm_neon.h to match.
>>
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
>> > b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
>> > new file mode 100644
>> > index
>> >
>> 0000000000000000000000000000000000000000..b99a945903c043c7410becaf6f
>> 09
>> > 496dd038410d
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
>> > @@ -0,0 +1,38 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */
>> > +
>> > +#define N 480
>> > +#define SIGNEDNESS_1 unsigned
>> > +#define SIGNEDNESS_2 signed
>> > +#define SIGNEDNESS_3 signed
>> > +#define SIGNEDNESS_4 unsigned
>> > +
>> > +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
>> > +SIGNEDNESS_3 char *restrict a,
>> > +   SIGNEDNESS_4 char *restrict b)
>> > +{
>> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
>> > +    {
>> > +      int av = a[i];
>> > +      int bv = b[i];
>> > +      SIGNEDNESS_2 short mult = av * bv;
>> > +      res += mult;
>> > +    }
>> > +  return res;
>> > +}
>> > +
>> > +SIGNEDNESS_1 int __attribute__ ((noipa)) g (SIGNEDNESS_1 int res,
>> > +SIGNEDNESS_3 char *restrict b,
>> > +   SIGNEDNESS_4 char *restrict a)
>> > +{
>> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
>> > +    {
>> > +      int av = a[i];
>> > +      int bv = b[i];
>> > +      SIGNEDNESS_2 short mult = av * bv;
>> > +      res += mult;
>> > +    }
>> > +  return res;
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
>> > b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
>> > new file mode 100644
>> > index
>> >
>> 0000000000000000000000000000000000000000..094dd51cea62e0ba05ec35056
>> 57b
>> > f05320e5fdbb
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
>> > @@ -0,0 +1,38 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O3 -march=armv8.2-a+i8mm+sve" } */
>> > +
>> > +#define N 480
>> > +#define SIGNEDNESS_1 unsigned
>> > +#define SIGNEDNESS_2 signed
>> > +#define SIGNEDNESS_3 signed
>> > +#define SIGNEDNESS_4 unsigned
>> > +
>> > +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
>> > +SIGNEDNESS_3 char *restrict a,
>> > +   SIGNEDNESS_4 char *restrict b)
>> > +{
>> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
>> > +    {
>> > +      int av = a[i];
>> > +      int bv = b[i];
>> > +      SIGNEDNESS_2 short mult = av * bv;
>> > +      res += mult;
>> > +    }
>> > +  return res;
>> > +}
>> > +
>> > +SIGNEDNESS_1 int __attribute__ ((noipa)) g (SIGNEDNESS_1 int res,
>> > +SIGNEDNESS_3 char *restrict b,
>> > +   SIGNEDNESS_4 char *restrict a)
>> > +{
>> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
>> > +    {
>> > +      int av = a[i];
>> > +      int bv = b[i];
>> > +      SIGNEDNESS_2 short mult = av * bv;
>> > +      res += mult;
>> > +    }
>> > +  return res;
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */
>>
>> Guess this is personal preference, but I don't think the SIGNEDNESS_*
>> macros add anything when used like this.  I remember doing something
>> similar in the past when including .c files from other .c files(!) in order to
>> avoid cut-&-paste, but there doesn't seem much benefit for standalone files
>> like these.
>
> If it's the same to you, I do prefer this version, since it's identical to the mid-end tests,
> It does allow when familiar with the  tests to just quickly see what it's testing.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd.md (aarch64_usdot<vsi2qi>): Rename to...
>         (usdot_prod<vsi2qi>): ... This.
>         * config/aarch64/aarch64-simd-builtins.def (usdot): Rename to...
>         (usdot_prod): ...This.
>         * config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Likewise.
>         * config/aarch64/aarch64-sve.md (@aarch64_<sur>dot_prod<vsi2qi>):
>         Rename to...
>         (@<sur>dot_prod<vsi2qi>): ...This.
>         * config/aarch64/aarch64-sve-builtins-base.cc
>         (svusdot_impl::expand): Use it.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/simd/vusdot-autovec.c: New test.
>         * gcc.target/aarch64/sve/vusdot-autovec.c: New test.

OK, thanks.

Richard
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4edee99051c4e2112b546becca47da32aae21df2..c9fb8e702732dd311fb10de17126432e2a63a32b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -648,6 +648,22 @@  (define_expand "<sur>dot_prod<vsi2qi>"
   DONE;
 })
 
+;; Auto-vectorizer pattern for usdot
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+			    (match_operand:<VSI2QI> 2 "register_operand")]
+		 UNSPEC_USDOT)
+		(match_operand:VS 3 "register_operand")))]
+  "TARGET_I8MM"
+{
+  emit_insn (
+    gen_aarch64_usdot<vsi2qi> (operands[3], operands[3], operands[1],
+			       operands[2]));
+  emit_move_insn (operands[0], operands[3]);
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index dfdf0e2fd186389cbddcff51ef52f8778d7fdb24..50adcd5404e97e610485140fdbfe4c8ebbf2f602 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2366,7 +2366,7 @@  public:
        Hence we do the same rotation on arguments as svdot_impl does.  */
     e.rotate_inputs_left (0, 3);
     machine_mode mode = e.vector_mode (0);
-    insn_code icode = code_for_aarch64_dot_prod (UNSPEC_USDOT, mode);
+    insn_code icode = code_for_dot_prod (UNSPEC_USDOT, mode);
     return e.use_exact_insn (icode);
   }
 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 7db2938bb84e04d066a7b07574e5cf344a3a8fb6..1278f6f12fadf8eec693cd47fd545ff3277f08f1 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6870,7 +6870,7 @@  (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>"
   [(set_attr "movprfx" "*,yes")]
 )
 
-(define_insn "@aarch64_<sur>dot_prod<vsi2qi>"
+(define_insn "@<sur>dot_prod<vsi2qi>"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
         (plus:VNx4SI_ONLY
 	  (unspec:VNx4SI_ONLY
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
new file mode 100644
index 0000000000000000000000000000000000000000..b99a945903c043c7410becaf6f09496dd038410d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b,
+   SIGNEDNESS_4 char *restrict a)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
new file mode 100644
index 0000000000000000000000000000000000000000..094dd51cea62e0ba05ec3505657bf05320e5fdbb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+i8mm+sve" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b,
+   SIGNEDNESS_4 char *restrict a)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {\tusdot\t} 2 } } */