Message ID | patch-15166-tamar@arm.com |
---|---|
State | New |
Headers | show |
Series | [AArch32] : correct usdot-product RTL patterns. | expand |
ping > -----Original Message----- > From: Tamar Christina > Sent: Tuesday, December 21, 2021 12:32 PM > To: gcc-patches@gcc.gnu.org > Cc: nd <nd@arm.com>; Ramana Radhakrishnan > <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw > <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov > <Kyrylo.Tkachov@arm.com> > Subject: [PATCH][AArch32]: correct usdot-product RTL patterns. > > Hi All, > > There was a bug in the ACLE specification for dot product which has now been > fixed[1]. This means some intrinsics were missing and are added by this > patch. > > Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues. > > Ok for master? > > [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3 > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32, > vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New. > * config/arm/arm_neon_builtins.def (usdot): Add V16QI. > (usdot_laneq, sudot_laneq): New. > * config/arm/neon.md (neon_<sup>dot_laneq<vsi2qi>): New. > (neon_<sup>dot_lane<vsi2qi>): Remove unneeded code. > > gcc/testsuite/ChangeLog: > > * gcc.target/arm/simd/vdot-2-1.c: Add new tests. > * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output. 
> > --- inline copy of patch -- > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index > af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc03f > 7d6a63bab9f5aa 100644 > --- a/gcc/config/arm/arm_neon.h > +++ b/gcc/config/arm/arm_neon.h > @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, > int8x8_t __b) > return __builtin_neon_usdotv8qi_ssus (__r, __a, __b); } > > +__extension__ extern __inline int32x4_t __attribute__ > +((__always_inline__, __gnu_inline__, __artificial__)) > +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) { > + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b); } > + > __extension__ extern __inline int32x2_t __attribute__ > ((__always_inline__, __gnu_inline__, __artificial__)) > vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, @@ -18962,6 +18969,38 @@ > vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, > return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index); } > > +__extension__ extern __inline int32x2_t __attribute__ > +((__always_inline__, __gnu_inline__, __artificial__)) > +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, > + int8x16_t __b, const int __index) > +{ > + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x4_t __attribute__ > +((__always_inline__, __gnu_inline__, __artificial__)) > +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, > + int8x16_t __b, const int __index) > +{ > + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, > +__index); } > + > +__extension__ extern __inline int32x2_t __attribute__ > +((__always_inline__, __gnu_inline__, __artificial__)) > +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, > + uint8x16_t __b, const int __index) > +{ > + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x4_t __attribute__ > +((__always_inline__, __gnu_inline__, __artificial__)) > +vsudotq_laneq_s32 (int32x4_t __r, 
int8x16_t __a, > + uint8x16_t __b, const int __index) { > + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, > +__index); } > + > #pragma GCC pop_options > > #pragma GCC pop_options > diff --git a/gcc/config/arm/arm_neon_builtins.def > b/gcc/config/arm/arm_neon_builtins.def > index > f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5..1c150ed3b650a003b44901b4d > 160a7d6f595f057 100644 > --- a/gcc/config/arm/arm_neon_builtins.def > +++ b/gcc/config/arm/arm_neon_builtins.def > @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) > VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) > VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) > > -VAR1 (USTERNOP, usdot, v8qi) > +VAR2 (USTERNOP, usdot, v8qi, v16qi) > VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) > VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi) > +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi) > +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi) > > VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf) > VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf) diff --git > a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index > 848166311b5f82c5facb66e97c2260a5aba5d302..1707d8e625079b83497a3db44 > db5e33405bb5fa1 100644 > --- a/gcc/config/arm/neon.md > +++ b/gcc/config/arm/neon.md > @@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>" > DOTPROD_I8MM) > (match_operand:VCVTI 1 "register_operand" "0")))] > "TARGET_I8MM" > + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]" > + [(set_attr "type" "neon_dot<q>")] > +) > + > +;; These instructions map to the __builtins for the Dot Product ;; > +indexed operations in the v8.6 I8MM extension. 
> +(define_insn "neon_<sup>dot_laneq<vsi2qi>" > + [(set (match_operand:VCVTI 0 "register_operand" "=w") > + (plus:VCVTI > + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" > "w") > + (match_operand:V16QI 3 "register_operand" "t") > + (match_operand:SI 4 "immediate_operand" "i")] > + DOTPROD_I8MM) > + (match_operand:VCVTI 1 "register_operand" "0")))] > + "TARGET_I8MM" > { > - operands[4] = GEN_INT (INTVAL (operands[4])); > - return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; > + int lane = INTVAL (operands[4]); > + if (lane > GET_MODE_NUNITS (V2SImode) - 1) > + { > + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); > + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]"; > + } > + else > + { > + operands[4] = GEN_INT (lane); > + return > "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]"; > + } > } > [(set_attr "type" "neon_dot<q>")] > ) > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > index > 88b80cff2329d9c502f40a31bbef70d26251c909..35d713f6a60d3d5880ddc8b43e > 238b7403b4f135 100644 > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > @@ -2,7 +2,7 @@ > /* { dg-require-effective-target arm_hard_ok } */ > /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > /* { dg-add-options arm_v8_2a_i8mm } */ > -/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */ > +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto" > +} */ > /* { dg-final { check-function-bodies "**" "" } } */ > > #include <arm_neon.h> > @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) > return vusdot_s32 (r, x, y); > } > > +/* > +**usfooq: > +** ... > +** vusdot\.s8 q0, q1, q2 > +** bx lr > +*/ > +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) { > + return vusdotq_s32 (r, x, y); > +} > + > /* > **usfoo_lane: > ** ... 
> @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, > uint8x8_t y) > return vsudotq_lane_s32 (r, x, y, 1); } > > +/* > +**usfoo_laneq: > +** ... > +** vusdot\.s8 d0, d1, d3\[0\] > +** bx lr > +*/ > +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) { > + return vusdot_laneq_s32 (r, x, y, 2); } > + > +/* > +**usfooq_laneq: > +** ... > +** vusdot\.s8 q0, q1, d5\[1\] > +** bx lr > +*/ > +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) { > + return vusdotq_laneq_s32 (r, x, y, 3); } > + > +/* Signed-Unsigned Dot Product instructions. */ > + > +/* > +**sfoo_laneq: > +** ... > +** vsudot\.u8 d0, d1, d3\[0\] > +** bx lr > +*/ > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) { > + return vsudot_laneq_s32 (r, x, y, 2); } > + > +/* > +**sfooq_laneq: > +** ... > +** vsudot\.u8 q0, q1, d5\[1\] > +** bx lr > +*/ > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) { > + return vsudotq_laneq_s32 (r, x, y, 3); } > + > /* > **usfoo_untied: > ** ... > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > index > 1c74718ca5644be05b4d4839c3a7ea40bff11e40..c57dd423dbc45b2f9f7890ada0 > f081f80381b05c 100644 > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > @@ -2,7 +2,7 @@ > /* { dg-require-effective-target arm_hard_ok } */ > /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > /* { dg-add-options arm_v8_2a_i8mm } */ > -/* { dg-additional-options "-O -save-temps -mbig-endian -mfloat-abi=hard" } > */ > +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard > +-mbig-endian -mfpu=auto" } */ > /* { dg-final { check-function-bodies "**" "" } } */ > > #include <arm_neon.h> > @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) > return vusdot_s32 (r, x, y); > } > > +/* > +**usfooq: > +** ... 
> +** vusdot\.s8 q0, q1, q2 > +** bx lr > +*/ > +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) { > + return vusdotq_s32 (r, x, y); > +} > + > /* > **usfoo_lane: > ** ... > @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, > uint8x8_t y) > return vsudotq_lane_s32 (r, x, y, 1); } > > +/* > +**usfoo_laneq: > +** ... > +** vusdot\.s8 d0, d1, d3\[0\] > +** bx lr > +*/ > +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) { > + return vusdot_laneq_s32 (r, x, y, 2); } > + > +/* > +**usfooq_laneq: > +** ... > +** vusdot\.s8 q0, q1, d5\[1\] > +** bx lr > +*/ > +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) { > + return vusdotq_laneq_s32 (r, x, y, 3); } > + > +/* Signed-Unsigned Dot Product instructions. */ > + > +/* > +**sfoo_laneq: > +** ... > +** vsudot\.u8 d0, d1, d3\[0\] > +** bx lr > +*/ > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) { > + return vsudot_laneq_s32 (r, x, y, 2); } > + > +/* > +**sfooq_laneq: > +** ... > +** vsudot\.u8 q0, q1, d5\[1\] > +** bx lr > +*/ > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) { > + return vsudotq_laneq_s32 (r, x, y, 3); } > + > /* > **usfoo_untied: > ** ... > @@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused, > int32x2_t r, uint8x8_t x, int8x8_ { > return vusdot_lane_s32 (r, x, y, 0); > } > + > > > --
Ping x3 > -----Original Message----- > From: Tamar Christina > Sent: Tuesday, January 11, 2022 7:10 AM > To: gcc-patches@gcc.gnu.org > Cc: nd <nd@arm.com>; Ramana Radhakrishnan > <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw > <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov > <Kyrylo.Tkachov@arm.com> > Subject: RE: [PATCH][AArch32]: correct usdot-product RTL patterns. > > ping > > > -----Original Message----- > > From: Tamar Christina > > Sent: Tuesday, December 21, 2021 12:32 PM > > To: gcc-patches@gcc.gnu.org > > Cc: nd <nd@arm.com>; Ramana Radhakrishnan > > <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw > > <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov > > <Kyrylo.Tkachov@arm.com> > > Subject: [PATCH][AArch32]: correct usdot-product RTL patterns. > > > > Hi All, > > > > There was a bug in the ACLE specification for dot product which has now > > been fixed[1]. This means some intrinsics were missing and are added > > by this patch. > > > > Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues. > > > > Ok for master? > > > > [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3 > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32, > > vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New. > > * config/arm/arm_neon_builtins.def (usdot): Add V16QI. > > (usdot_laneq, sudot_laneq): New. > > * config/arm/neon.md (neon_<sup>dot_laneq<vsi2qi>): New. > > (neon_<sup>dot_lane<vsi2qi>): Remove unneeded code. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/arm/simd/vdot-2-1.c: Add new tests. > > * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output. 
> > > > --- inline copy of patch -- > > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h > > index > > > af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc03f > > 7d6a63bab9f5aa 100644 > > --- a/gcc/config/arm/arm_neon.h > > +++ b/gcc/config/arm/arm_neon.h > > @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, > > int8x8_t __b) > > return __builtin_neon_usdotv8qi_ssus (__r, __a, __b); } > > > > +__extension__ extern __inline int32x4_t __attribute__ > > +((__always_inline__, __gnu_inline__, __artificial__)) > > +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) { > > + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b); } > > + > > __extension__ extern __inline int32x2_t __attribute__ > > ((__always_inline__, __gnu_inline__, __artificial__)) > > vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, @@ -18962,6 +18969,38 > > @@ > > vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, > > return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, > > __index); } > > > > +__extension__ extern __inline int32x2_t __attribute__ > > +((__always_inline__, __gnu_inline__, __artificial__)) > > +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, > > + int8x16_t __b, const int __index) { > > + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, > > +__index); } > > + > > +__extension__ extern __inline int32x4_t __attribute__ > > +((__always_inline__, __gnu_inline__, __artificial__)) > > +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, > > + int8x16_t __b, const int __index) { > > + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, > > +__index); } > > + > > +__extension__ extern __inline int32x2_t __attribute__ > > +((__always_inline__, __gnu_inline__, __artificial__)) > > +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, > > + uint8x16_t __b, const int __index) { > > + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, > > +__index); } > > + > > +__extension__ extern __inline int32x4_t __attribute__ > > 
+((__always_inline__, __gnu_inline__, __artificial__)) > > +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, > > + uint8x16_t __b, const int __index) { > > + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, > > +__index); } > > + > > #pragma GCC pop_options > > > > #pragma GCC pop_options > > diff --git a/gcc/config/arm/arm_neon_builtins.def > > b/gcc/config/arm/arm_neon_builtins.def > > index > > > f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5..1c150ed3b650a003b44901b4d > > 160a7d6f595f057 100644 > > --- a/gcc/config/arm/arm_neon_builtins.def > > +++ b/gcc/config/arm/arm_neon_builtins.def > > @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) > > VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) > > VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) > > > > -VAR1 (USTERNOP, usdot, v8qi) > > +VAR2 (USTERNOP, usdot, v8qi, v16qi) > > VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) > > VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi) > > +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi) > > +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi) > > > > VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf) > > VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf) diff --git > > a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index > > > 848166311b5f82c5facb66e97c2260a5aba5d302..1707d8e625079b83497a3db44 > > db5e33405bb5fa1 100644 > > --- a/gcc/config/arm/neon.md > > +++ b/gcc/config/arm/neon.md > > @@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>" > > DOTPROD_I8MM) > > (match_operand:VCVTI 1 "register_operand" "0")))] > > "TARGET_I8MM" > > + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]" > > + [(set_attr "type" "neon_dot<q>")] > > +) > > + > > +;; These instructions map to the __builtins for the Dot Product ;; > > +indexed operations in the v8.6 I8MM extension. 
> > +(define_insn "neon_<sup>dot_laneq<vsi2qi>" > > + [(set (match_operand:VCVTI 0 "register_operand" "=w") > > + (plus:VCVTI > > + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" > > "w") > > + (match_operand:V16QI 3 "register_operand" "t") > > + (match_operand:SI 4 "immediate_operand" "i")] > > + DOTPROD_I8MM) > > + (match_operand:VCVTI 1 "register_operand" "0")))] > > + "TARGET_I8MM" > > { > > - operands[4] = GEN_INT (INTVAL (operands[4])); > > - return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; > > + int lane = INTVAL (operands[4]); > > + if (lane > GET_MODE_NUNITS (V2SImode) - 1) > > + { > > + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); > > + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]"; > > + } > > + else > > + { > > + operands[4] = GEN_INT (lane); > > + return > > "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]"; > > + } > > } > > [(set_attr "type" "neon_dot<q>")] > > ) > > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > > b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > > index > > > 88b80cff2329d9c502f40a31bbef70d26251c909..35d713f6a60d3d5880ddc8b43e > > 238b7403b4f135 100644 > > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > > @@ -2,7 +2,7 @@ > > /* { dg-require-effective-target arm_hard_ok } */ > > /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > > /* { dg-add-options arm_v8_2a_i8mm } */ > > -/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */ > > +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto" > > +} */ > > /* { dg-final { check-function-bodies "**" "" } } */ > > > > #include <arm_neon.h> > > @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) > > return vusdot_s32 (r, x, y); > > } > > > > +/* > > +**usfooq: > > +** ... 
> > +** vusdot\.s8 q0, q1, q2 > > +** bx lr > > +*/ > > +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) { > > + return vusdotq_s32 (r, x, y); > > +} > > + > > /* > > **usfoo_lane: > > ** ... > > @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, > > uint8x8_t y) > > return vsudotq_lane_s32 (r, x, y, 1); } > > > > +/* > > +**usfoo_laneq: > > +** ... > > +** vusdot\.s8 d0, d1, d3\[0\] > > +** bx lr > > +*/ > > +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) { > > + return vusdot_laneq_s32 (r, x, y, 2); } > > + > > +/* > > +**usfooq_laneq: > > +** ... > > +** vusdot\.s8 q0, q1, d5\[1\] > > +** bx lr > > +*/ > > +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) { > > + return vusdotq_laneq_s32 (r, x, y, 3); } > > + > > +/* Signed-Unsigned Dot Product instructions. */ > > + > > +/* > > +**sfoo_laneq: > > +** ... > > +** vsudot\.u8 d0, d1, d3\[0\] > > +** bx lr > > +*/ > > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) { > > + return vsudot_laneq_s32 (r, x, y, 2); } > > + > > +/* > > +**sfooq_laneq: > > +** ... > > +** vsudot\.u8 q0, q1, d5\[1\] > > +** bx lr > > +*/ > > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) { > > + return vsudotq_laneq_s32 (r, x, y, 3); } > > + > > /* > > **usfoo_untied: > > ** ... 
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > > b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > > index > > > 1c74718ca5644be05b4d4839c3a7ea40bff11e40..c57dd423dbc45b2f9f7890ada0 > > f081f80381b05c 100644 > > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > > @@ -2,7 +2,7 @@ > > /* { dg-require-effective-target arm_hard_ok } */ > > /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > > /* { dg-add-options arm_v8_2a_i8mm } */ > > -/* { dg-additional-options "-O -save-temps -mbig-endian > > -mfloat-abi=hard" } */ > > +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard > > +-mbig-endian -mfpu=auto" } */ > > /* { dg-final { check-function-bodies "**" "" } } */ > > > > #include <arm_neon.h> > > @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) > > return vusdot_s32 (r, x, y); > > } > > > > +/* > > +**usfooq: > > +** ... > > +** vusdot\.s8 q0, q1, q2 > > +** bx lr > > +*/ > > +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) { > > + return vusdotq_s32 (r, x, y); > > +} > > + > > /* > > **usfoo_lane: > > ** ... > > @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, > > uint8x8_t y) > > return vsudotq_lane_s32 (r, x, y, 1); } > > > > +/* > > +**usfoo_laneq: > > +** ... > > +** vusdot\.s8 d0, d1, d3\[0\] > > +** bx lr > > +*/ > > +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) { > > + return vusdot_laneq_s32 (r, x, y, 2); } > > + > > +/* > > +**usfooq_laneq: > > +** ... > > +** vusdot\.s8 q0, q1, d5\[1\] > > +** bx lr > > +*/ > > +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) { > > + return vusdotq_laneq_s32 (r, x, y, 3); } > > + > > +/* Signed-Unsigned Dot Product instructions. */ > > + > > +/* > > +**sfoo_laneq: > > +** ... 
> > +** vsudot\.u8 d0, d1, d3\[0\] > > +** bx lr > > +*/ > > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) { > > + return vsudot_laneq_s32 (r, x, y, 2); } > > + > > +/* > > +**sfooq_laneq: > > +** ... > > +** vsudot\.u8 q0, q1, d5\[1\] > > +** bx lr > > +*/ > > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) { > > + return vsudotq_laneq_s32 (r, x, y, 3); } > > + > > /* > > **usfoo_untied: > > ** ... > > @@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused, > > int32x2_t r, uint8x8_t x, int8x8_ { > > return vusdot_lane_s32 (r, x, y, 0); } > > + > > > > > > --
> -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Tuesday, December 21, 2021 12:32 PM > To: gcc-patches@gcc.gnu.org > Cc: nd <nd@arm.com>; Ramana Radhakrishnan > <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw > <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov > <Kyrylo.Tkachov@arm.com> > Subject: [PATCH][AArch32]: correct usdot-product RTL patterns. > > Hi All, > > There was a bug in the ACLE specification for dot product which has now > been fixed[1]. This means some intrinsics were missing and are added by > this > patch. > > Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues. > > Ok for master? Ok. Thanks, Kyrill > > [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3 > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/arm/arm_neon.h (vusdotq_s32, vusdot_laneq_s32, > vusdotq_laneq_s32, vsudot_laneq_s32, vsudotq_laneq_s32): New. > * config/arm/arm_neon_builtins.def (usdot): Add V16QI. > (usdot_laneq, sudot_laneq): New. > * config/arm/neon.md (neon_<sup>dot_laneq<vsi2qi>): New. > (neon_<sup>dot_lane<vsi2qi>): Remove unneeded code. > > gcc/testsuite/ChangeLog: > > * gcc.target/arm/simd/vdot-2-1.c: Add new tests. > * gcc.target/arm/simd/vdot-2-2.c: Likewise and fix output. 
> > --- inline copy of patch -- > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h > index > af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc0 > 3f7d6a63bab9f5aa 100644 > --- a/gcc/config/arm/arm_neon.h > +++ b/gcc/config/arm/arm_neon.h > @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, > int8x8_t __b) > return __builtin_neon_usdotv8qi_ssus (__r, __a, __b); > } > > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) > +{ > + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b); > +} > + > __extension__ extern __inline int32x2_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, > @@ -18962,6 +18969,38 @@ vsudotq_lane_s32 (int32x4_t __r, int8x16_t > __a, > return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index); > } > > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, > + int8x16_t __b, const int __index) > +{ > + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, > + int8x16_t __b, const int __index) > +{ > + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, > + uint8x16_t __b, const int __index) > +{ > + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vsudotq_laneq_s32 
(int32x4_t __r, int8x16_t __a, > + uint8x16_t __b, const int __index) > +{ > + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, __index); > +} > + > #pragma GCC pop_options > > #pragma GCC pop_options > diff --git a/gcc/config/arm/arm_neon_builtins.def > b/gcc/config/arm/arm_neon_builtins.def > index > f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5..1c150ed3b650a003b44901b4 > d160a7d6f595f057 100644 > --- a/gcc/config/arm/arm_neon_builtins.def > +++ b/gcc/config/arm/arm_neon_builtins.def > @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) > VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) > VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) > > -VAR1 (USTERNOP, usdot, v8qi) > +VAR2 (USTERNOP, usdot, v8qi, v16qi) > VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) > VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi) > +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi) > +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi) > > VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf) > VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf) > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md > index > 848166311b5f82c5facb66e97c2260a5aba5d302..1707d8e625079b83497a3db > 44db5e33405bb5fa1 100644 > --- a/gcc/config/arm/neon.md > +++ b/gcc/config/arm/neon.md > @@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>" > DOTPROD_I8MM) > (match_operand:VCVTI 1 "register_operand" "0")))] > "TARGET_I8MM" > + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]" > + [(set_attr "type" "neon_dot<q>")] > +) > + > +;; These instructions map to the __builtins for the Dot Product > +;; indexed operations in the v8.6 I8MM extension. 
> +(define_insn "neon_<sup>dot_laneq<vsi2qi>" > + [(set (match_operand:VCVTI 0 "register_operand" "=w") > + (plus:VCVTI > + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" > "w") > + (match_operand:V16QI 3 "register_operand" "t") > + (match_operand:SI 4 "immediate_operand" "i")] > + DOTPROD_I8MM) > + (match_operand:VCVTI 1 "register_operand" "0")))] > + "TARGET_I8MM" > { > - operands[4] = GEN_INT (INTVAL (operands[4])); > - return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; > + int lane = INTVAL (operands[4]); > + if (lane > GET_MODE_NUNITS (V2SImode) - 1) > + { > + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); > + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]"; > + } > + else > + { > + operands[4] = GEN_INT (lane); > + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]"; > + } > } > [(set_attr "type" "neon_dot<q>")] > ) > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > index > 88b80cff2329d9c502f40a31bbef70d26251c909..35d713f6a60d3d5880ddc8b4 > 3e238b7403b4f135 100644 > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c > @@ -2,7 +2,7 @@ > /* { dg-require-effective-target arm_hard_ok } */ > /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > /* { dg-add-options arm_v8_2a_i8mm } */ > -/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */ > +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto" } > */ > /* { dg-final { check-function-bodies "**" "" } } */ > > #include <arm_neon.h> > @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) > return vusdot_s32 (r, x, y); > } > > +/* > +**usfooq: > +** ... > +** vusdot\.s8 q0, q1, q2 > +** bx lr > +*/ > +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) > +{ > + return vusdotq_s32 (r, x, y); > +} > + > /* > **usfoo_lane: > ** ... 
> @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, > uint8x8_t y) > return vsudotq_lane_s32 (r, x, y, 1); > } > > +/* > +**usfoo_laneq: > +** ... > +** vusdot\.s8 d0, d1, d3\[0\] > +** bx lr > +*/ > +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) > +{ > + return vusdot_laneq_s32 (r, x, y, 2); > +} > + > +/* > +**usfooq_laneq: > +** ... > +** vusdot\.s8 q0, q1, d5\[1\] > +** bx lr > +*/ > +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) > +{ > + return vusdotq_laneq_s32 (r, x, y, 3); > +} > + > +/* Signed-Unsigned Dot Product instructions. */ > + > +/* > +**sfoo_laneq: > +** ... > +** vsudot\.u8 d0, d1, d3\[0\] > +** bx lr > +*/ > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) > +{ > + return vsudot_laneq_s32 (r, x, y, 2); > +} > + > +/* > +**sfooq_laneq: > +** ... > +** vsudot\.u8 q0, q1, d5\[1\] > +** bx lr > +*/ > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) > +{ > + return vsudotq_laneq_s32 (r, x, y, 3); > +} > + > /* > **usfoo_untied: > ** ... > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > index > 1c74718ca5644be05b4d4839c3a7ea40bff11e40..c57dd423dbc45b2f9f7890ad > a0f081f80381b05c 100644 > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c > @@ -2,7 +2,7 @@ > /* { dg-require-effective-target arm_hard_ok } */ > /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > /* { dg-add-options arm_v8_2a_i8mm } */ > -/* { dg-additional-options "-O -save-temps -mbig-endian -mfloat-abi=hard" } > */ > +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mbig-endian - > mfpu=auto" } */ > /* { dg-final { check-function-bodies "**" "" } } */ > > #include <arm_neon.h> > @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) > return vusdot_s32 (r, x, y); > } > > +/* > +**usfooq: > +** ... 
> +** vusdot\.s8 q0, q1, q2 > +** bx lr > +*/ > +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) > +{ > + return vusdotq_s32 (r, x, y); > +} > + > /* > **usfoo_lane: > ** ... > @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, > uint8x8_t y) > return vsudotq_lane_s32 (r, x, y, 1); > } > > +/* > +**usfoo_laneq: > +** ... > +** vusdot\.s8 d0, d1, d3\[0\] > +** bx lr > +*/ > +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) > +{ > + return vusdot_laneq_s32 (r, x, y, 2); > +} > + > +/* > +**usfooq_laneq: > +** ... > +** vusdot\.s8 q0, q1, d5\[1\] > +** bx lr > +*/ > +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) > +{ > + return vusdotq_laneq_s32 (r, x, y, 3); > +} > + > +/* Signed-Unsigned Dot Product instructions. */ > + > +/* > +**sfoo_laneq: > +** ... > +** vsudot\.u8 d0, d1, d3\[0\] > +** bx lr > +*/ > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) > +{ > + return vsudot_laneq_s32 (r, x, y, 2); > +} > + > +/* > +**sfooq_laneq: > +** ... > +** vsudot\.u8 q0, q1, d5\[1\] > +** bx lr > +*/ > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) > +{ > + return vsudotq_laneq_s32 (r, x, y, 3); > +} > + > /* > **usfoo_untied: > ** ... > @@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused, > int32x2_t r, uint8x8_t x, int8x8_ > { > return vusdot_lane_s32 (r, x, y, 0); > } > + > > > --
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index af6ac63dc3b47830d92f199d93153ff510f658e9..2255d600549a2a1e5dbcebc03f7d6a63bab9f5aa 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18930,6 +18930,13 @@ vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) return __builtin_neon_usdotv8qi_ssus (__r, __a, __b); } +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) +{ + return __builtin_neon_usdotv16qi_ssus (__r, __a, __b); +} + __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, @@ -18962,6 +18969,38 @@ vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, return __builtin_neon_sudot_lanev16qi_sssus (__r, __a, __b, __index); } +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, + int8x16_t __b, const int __index) +{ + return __builtin_neon_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, + int8x16_t __b, const int __index) +{ + return __builtin_neon_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, + uint8x16_t __b, const int __index) +{ + return __builtin_neon_sudot_laneqv8qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, + uint8x16_t __b, const int __index) +{ + return __builtin_neon_sudot_laneqv16qi_sssus (__r, __a, __b, __index); 
+} + #pragma GCC pop_options #pragma GCC pop_options diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5..1c150ed3b650a003b44901b4d160a7d6f595f057 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -345,9 +345,11 @@ VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) -VAR1 (USTERNOP, usdot, v8qi) +VAR2 (USTERNOP, usdot, v8qi, v16qi) VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) VAR2 (SUMAC_LANE_QUADTUP, sudot_lane, v8qi, v16qi) +VAR2 (USMAC_LANE_QUADTUP, usdot_laneq, v8qi, v16qi) +VAR2 (SUMAC_LANE_QUADTUP, sudot_laneq, v8qi, v16qi) VAR4 (BINOP, vcadd90, v4hf, v2sf, v8hf, v4sf) VAR4 (BINOP, vcadd270, v4hf, v2sf, v8hf, v4sf) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 848166311b5f82c5facb66e97c2260a5aba5d302..1707d8e625079b83497a3db44db5e33405bb5fa1 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2977,9 +2977,33 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>" DOTPROD_I8MM) (match_operand:VCVTI 1 "register_operand" "0")))] "TARGET_I8MM" + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]" + [(set_attr "type" "neon_dot<q>")] +) + +;; These instructions map to the __builtins for the Dot Product +;; indexed operations in the v8.6 I8MM extension. 
+(define_insn "neon_<sup>dot_laneq<vsi2qi>" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD_I8MM) + (match_operand:VCVTI 1 "register_operand" "0")))] + "TARGET_I8MM" { - operands[4] = GEN_INT (INTVAL (operands[4])); - return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; + int lane = INTVAL (operands[4]); + if (lane > GET_MODE_NUNITS (V2SImode) - 1) + { + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]"; + } + else + { + operands[4] = GEN_INT (lane); + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]"; + } } [(set_attr "type" "neon_dot<q>")] ) diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c index 88b80cff2329d9c502f40a31bbef70d26251c909..35d713f6a60d3d5880ddc8b43e238b7403b4f135 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-1.c @@ -2,7 +2,7 @@ /* { dg-require-effective-target arm_hard_ok } */ /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ /* { dg-add-options arm_v8_2a_i8mm } */ -/* { dg-additional-options "-O -save-temps -mfloat-abi=hard" } */ +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mfpu=auto" } */ /* { dg-final { check-function-bodies "**" "" } } */ #include <arm_neon.h> @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) return vusdot_s32 (r, x, y); } +/* +**usfooq: +** ... +** vusdot\.s8 q0, q1, q2 +** bx lr +*/ +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_s32 (r, x, y); +} + /* **usfoo_lane: ** ... @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) return vsudotq_lane_s32 (r, x, y, 1); } +/* +**usfoo_laneq: +** ... 
+** vusdot\.s8 d0, d1, d3\[0\] +** bx lr +*/ +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) +{ + return vusdot_laneq_s32 (r, x, y, 2); +} + +/* +**usfooq_laneq: +** ... +** vusdot\.s8 q0, q1, d5\[1\] +** bx lr +*/ +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_laneq_s32 (r, x, y, 3); +} + +/* Signed-Unsigned Dot Product instructions. */ + +/* +**sfoo_laneq: +** ... +** vsudot\.u8 d0, d1, d3\[0\] +** bx lr +*/ +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) +{ + return vsudot_laneq_s32 (r, x, y, 2); +} + +/* +**sfooq_laneq: +** ... +** vsudot\.u8 q0, q1, d5\[1\] +** bx lr +*/ +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) +{ + return vsudotq_laneq_s32 (r, x, y, 3); +} + /* **usfoo_untied: ** ... diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c index 1c74718ca5644be05b4d4839c3a7ea40bff11e40..c57dd423dbc45b2f9f7890ada0f081f80381b05c 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-2-2.c @@ -2,7 +2,7 @@ /* { dg-require-effective-target arm_hard_ok } */ /* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ /* { dg-add-options arm_v8_2a_i8mm } */ -/* { dg-additional-options "-O -save-temps -mbig-endian -mfloat-abi=hard" } */ +/* { dg-additional-options "-O -save-temps -mfloat-abi=hard -mbig-endian -mfpu=auto" } */ /* { dg-final { check-function-bodies "**" "" } } */ #include <arm_neon.h> @@ -20,6 +20,17 @@ int32x2_t usfoo (int32x2_t r, uint8x8_t x, int8x8_t y) return vusdot_s32 (r, x, y); } +/* +**usfooq: +** ... +** vusdot\.s8 q0, q1, q2 +** bx lr +*/ +int32x4_t usfooq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_s32 (r, x, y); +} + /* **usfoo_lane: ** ... @@ -66,6 +77,52 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) return vsudotq_lane_s32 (r, x, y, 1); } +/* +**usfoo_laneq: +** ... 
+** vusdot\.s8 d0, d1, d3\[0\] +** bx lr +*/ +int32x2_t usfoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) +{ + return vusdot_laneq_s32 (r, x, y, 2); +} + +/* +**usfooq_laneq: +** ... +** vusdot\.s8 q0, q1, d5\[1\] +** bx lr +*/ +int32x4_t usfooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_laneq_s32 (r, x, y, 3); +} + +/* Signed-Unsigned Dot Product instructions. */ + +/* +**sfoo_laneq: +** ... +** vsudot\.u8 d0, d1, d3\[0\] +** bx lr +*/ +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) +{ + return vsudot_laneq_s32 (r, x, y, 2); +} + +/* +**sfooq_laneq: +** ... +** vsudot\.u8 q0, q1, d5\[1\] +** bx lr +*/ +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) +{ + return vsudotq_laneq_s32 (r, x, y, 3); +} + /* **usfoo_untied: ** ... @@ -89,3 +146,4 @@ int32x2_t usfoo_lane_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_ { return vusdot_lane_s32 (r, x, y, 0); } +