Message ID | patch-14661-tamar@arm.com
---|---
State | New
Series | [AArch32]: correct dot-product RTL patterns.
ping

> -----Original Message-----
> From: Tamar Christina
> Sent: Tuesday, December 21, 2021 12:31 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan <Ramana.Radhakrishnan@arm.com>;
> Richard Earnshaw <Richard.Earnshaw@arm.com>; nickc@redhat.com;
> Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: [AArch32]: correct dot-product RTL patterns.
>
> Hi All,
>
> The previous fix for this problem was wrong due to a subtle difference
> between where NEON expects the RMW values and where the intrinsics expect
> them.
>
> The insn pattern is modeled after the intrinsics and so needs an expand
> for the vectorizer optab to switch the RTL.
>
> However operand[3] is not expected to be written to, so the current
> pattern is bogus.
>
> Instead we use the expand to shuffle around the RTL.
>
> The vectorizer expects operands[3] and operands[0] to be the same, but
> the aarch64 intrinsics expanders expect operands[0] and operands[1] to be
> the same.
>
> This also fixes some issues with big-endian: each dot product performs
> four 8-bit multiplications per 32-bit lane.  However, unlike AArch64, on
> AArch32 we don't enter lanes in GCC lane-indexed order aside from
> loads/stores.  This means no lane remappings are done in arm-builtins.c,
> and so none should be done on the instruction side.
>
> There are some other instructions that need inspection, as I think there
> are more incorrect ones.
>
> Third, there was a bug in the ACLE specification for dot product which
> has now been fixed [1].  This means some intrinsics were missing and are
> added by this patch.
>
> Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues.
>
> Ok for master? and active branches after some stew?
>
> [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/arm/arm_neon.h (vdot_laneq_u32, vdotq_laneq_u32,
> 	vdot_laneq_s32, vdotq_laneq_s32): New.
> 	* config/arm/arm_neon_builtins.def (sdot_laneq, udot_laneq): New.
> 	* config/arm/neon.md (neon_<sup>dot<vsi2qi>): New.
> 	(<sup>dot_prod<vsi2qi>): Re-order rtl.
> 	(neon_<sup>dot_lane<vsi2qi>): Fix rtl order and endianness.
> 	(neon_<sup>dot_laneq<vsi2qi>): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/arm/simd/vdot-compile.c: Add new cases.
> 	* gcc.target/arm/simd/vdot-exec.c: Likewise.
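For reference, a minimal C sketch (not part of the patch) of the kind of reduction loop the <sup>dot_prod optab in the patch is meant to handle; the function name, variable names and compile flags below are illustrative only:

    #include <stdint.h>

    /* With -O3 and a dot-product-capable -march/-mfpu (exact flags depend
       on the target), the vectorizer can implement this widening
       multiply-accumulate reduction with vsdot, folding four 8-bit
       products into each 32-bit lane of the running sum.  */
    int32_t
    dot_s8 (const int8_t *a, const int8_t *b, int len)
    {
      int32_t r = 0;
      for (int i = 0; i < len; i++)
        r += a[i] * b[i];
      return r;
    }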
>
> --- inline copy of patch --
>
> [patch body snipped; it is identical to the full diff reproduced at the
> end of this page]
>
> --
Hi Tamar,

Sorry for the delay.

> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Tuesday, January 11, 2022 7:10 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan <Ramana.Radhakrishnan@arm.com>;
> Richard Earnshaw <Richard.Earnshaw@arm.com>; nickc@redhat.com;
> Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: RE: [AArch32]: correct dot-product RTL patterns.
>
> ping
>
> > [quoted cover letter and ChangeLog snipped; identical to the message
> > shown above]

Ok but...
> > --- inline copy of patch --
> >
> > [quoted patch body snipped up to the hunk referenced below; it is
> > identical to the full diff at the end of this page]
> >
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> > b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> > @@ -2,6 +2,7 @@
> >  /* { dg-additional-options "-O3" } */
> >  /* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw } */
> >  /* { dg-add-options arm_v8_2a_dotprod_neon } */
> > +/* { dg-additional-options "-w" } */

... Why is "-w" needed here?  Can the test be adjusted to not generate
unnecessary warnings instead?

Thanks,
Kyrill

> > [remainder of quoted patch snipped]
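For context before the diff below, a hedged usage sketch of the new laneq intrinsics it adds to arm_neon.h; the wrapper function names are illustrative only, and the example assumes an AArch32 target compiled with dot-product support enabled:

    #include <arm_neon.h>

    /* The *_laneq_* forms take a 128-bit (q) vector as the indexed
       multiplicand, so the lane index covers all four 32-bit lanes (0..3)
       rather than the two lanes of a 64-bit (d) vector.  */
    int32x2_t
    use_vdot_laneq (int32x2_t acc, int8x8_t a, int8x16_t b)
    {
      return vdot_laneq_s32 (acc, a, b, 3);   /* accumulate a against lane 3 of b.  */
    }

    uint32x4_t
    use_vdotq_laneq (uint32x4_t acc, uint8x16_t a, uint8x16_t b)
    {
      return vdotq_laneq_u32 (acc, a, b, 0);  /* accumulate a against lane 0 of b.  */
    }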
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 3364b37f69dfc33082388246c03149d9ad66a634..af6ac63dc3b47830d92f199d93153ff510f658e9 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18243,6 +18243,35 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index) return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index); } +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, const int __index) +{ + return __builtin_neon_udot_laneqv8qi_uuuus (__r, __a, __b, __index); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b, + const int __index) +{ + return __builtin_neon_udot_laneqv16qi_uuuus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int __index) +{ + return __builtin_neon_sdot_laneqv8qi (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index) +{ + return __builtin_neon_sdot_laneqv16qi (__r, __a, __b, __index); +} + #pragma GCC pop_options #endif diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index fafb5c6fc51c16679ead1afda7cccfea8264fd15..f83dd4327c16c0af68f72eb6d9ca8cf21e2e56b5 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -342,6 +342,8 @@ VAR2 (TERNOP, sdot, v8qi, v16qi) VAR2 (UTERNOP, udot, v8qi, v16qi) VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi) VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) +VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) +VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) VAR1 (USTERNOP, usdot, v8qi) VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 8b0a396947cc8e7345f178b926128d7224fb218a..848166311b5f82c5facb66e97c2260a5aba5d302 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2866,20 +2866,49 @@ (define_expand "cmul<conj_op><mode>3" }) -;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "neon_<sup>dot<vsi2qi>" +;; These map to the auto-vectorizer Dot Product optab. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; i<len; i++) { +;; c = a[i] * b[i]; +;; r += c; +;; } +;; return result; +;; +;; This can be auto-vectorized to +;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; +;; +;; given enough iterations. However the vectorizer can keep unrolling the loop +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; +;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; +;; ... +;; +;; and so the vectorizer provides r, in which the result has to be accumulated. 
+(define_insn "<sup>dot_prod<vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") - (unspec:VCVTI [(match_operand:<VSI2QI> 2 - "register_operand" "w") - (match_operand:<VSI2QI> 3 - "register_operand" "w")] - DOTPROD)))] + (plus:VCVTI + (unspec:VCVTI [(match_operand:<VSI2QI> 1 "register_operand" "w") + (match_operand:<VSI2QI> 2 "register_operand" "w")] + DOTPROD) + (match_operand:VCVTI 3 "register_operand" "0")))] "TARGET_DOTPROD" - "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3" + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_dot<q>")] ) +;; These instructions map to the __builtins for the Dot Product operations +(define_expand "neon_<sup>dot<vsi2qi>" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand") + (match_operand:<VSI2QI> 3 "register_operand")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand")))] + "TARGET_DOTPROD" +) + ;; These instructions map to the __builtins for the Dot Product operations. (define_insn "neon_usdot<vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") @@ -2898,17 +2927,40 @@ (define_insn "neon_usdot<vsi2qi>" ;; indexed operations. (define_insn "neon_<sup>dot_lane<vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") - (unspec:VCVTI [(match_operand:<VSI2QI> 2 - "register_operand" "w") - (match_operand:V8QI 3 "register_operand" "t") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VCVTI + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w") + (match_operand:V8QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand" "0")))] + "TARGET_DOTPROD" + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; + [(set_attr "type" "neon_dot<q>")] +) + +;; These instructions map to the __builtins for the Dot Product +;; indexed operations. +(define_insn "neon_<sup>dot_laneq<vsi2qi>" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand" "0")))] "TARGET_DOTPROD" { - operands[4] - = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4]))); - return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; + int lane = INTVAL (operands[4]); + if (lane > GET_MODE_NUNITS (V2SImode) - 1) + { + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]"; + } + else + { + operands[4] = GEN_INT (lane); + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]"; + } } [(set_attr "type" "neon_dot<q>")] ) @@ -2932,43 +2984,6 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>" [(set_attr "type" "neon_dot<q>")] ) -;; These expands map to the Dot Product optab the vectorizer checks for. -;; The auto-vectorizer expects a dot product builtin that also does an -;; accumulation into the provided register. -;; Given the following pattern -;; -;; for (i=0; i<len; i++) { -;; c = a[i] * b[i]; -;; r += c; -;; } -;; return result; -;; -;; This can be auto-vectorized to -;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; -;; -;; given enough iterations. 
However the vectorizer can keep unrolling the loop -;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; -;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; -;; ... -;; -;; and so the vectorizer provides r, in which the result has to be accumulated. -(define_expand "<sup>dot_prod<vsi2qi>" - [(set (match_operand:VCVTI 0 "register_operand") - (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1 - "register_operand") - (match_operand:<VSI2QI> 2 - "register_operand")] - DOTPROD) - (match_operand:VCVTI 3 "register_operand")))] - "TARGET_DOTPROD" -{ - emit_insn ( - gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) - ;; Auto-vectorizer pattern for usdot (define_expand "usdot_prod<vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand") diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c index b3bd3bf00e3822fdd60b5955165583d5a5cdc1d0..d3541e829a44fa07972096a02226adea1d26f09d 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c @@ -49,8 +49,28 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, int8x8_t y) return vdotq_lane_s32 (r, x, y, 0); } -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+} 4 } } */ +int32x2_t sfoo_laneq1 (int32x2_t r, int8x8_t x, int8x16_t y) +{ + return vdot_laneq_s32 (r, x, y, 0); +} + +int32x4_t sfooq_lane1 (int32x4_t r, int8x16_t x, int8x16_t y) +{ + return vdotq_laneq_s32 (r, x, y, 0); +} + +int32x2_t sfoo_laneq2 (int32x2_t r, int8x8_t x, int8x16_t y) +{ + return vdot_laneq_s32 (r, x, y, 2); +} + +int32x4_t sfooq_lane2 (int32x4_t r, int8x16_t x, int8x16_t y) +{ + return vdotq_laneq_s32 (r, x, y, 2); +} + +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+} 6 } } */ /* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */ -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */ +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */ +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c index 054f4703394b4184284dac371415bef8e9bac45d..97b7898bd6a0fc9a898eba0ea15fbf38eb1405a3 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c @@ -2,6 +2,7 @@ /* { dg-additional-options "-O3" } */ /* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw } */ /* { dg-add-options arm_v8_2a_dotprod_neon } */ +/* { dg-additional-options "-w" } */ #include <arm_neon.h> @@ -33,7 +34,20 @@ extern void abort(); t3 f##_##rx1 = {0}; \ f##_##rx1 = f (f##_##rx1, f##_##x, f##_##y, ORDER (1, 1)); \ if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4) \ - abort (); \ + abort (); + +#define P2(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2,n1,n1,n1,n1,n2,n2,n2,n2 +#define TEST_LANEQ(t1, t2, t3, f, r1, r2, n1, n2, n3, n4) \ + ARR(f, x, t1, r1); \ + ARR(f, y, t2, r2); \ + t3 f##_##rx = {0}; \ + f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (3, 2)); \ + if (f##_##rx[0] != n1 || f##_##rx[1] != n2) \ + abort (); \ + t3 f##_##rx1 = {0}; \ + f##_##rx1 = f (f##_##rx1, f##_##x, f##_##y, ORDER (3, 3)); \ + if 
(f##_##rx1[0] != n3 || f##_##rx1[1] != n4) \ + abort (); int main() @@ -45,11 +59,16 @@ main() TEST (int8x16_t, int8x16_t, int32x4_t, vdotq_s32, P(1,2), P(-2,-3), -8, -24); TEST_LANE (uint8x8_t, uint8x8_t, uint32x2_t, vdot_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24); - TEST_LANE (int8x8_t, int8x8_t, int32x2_t, vdot_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24); TEST_LANE (uint8x16_t, uint8x8_t, uint32x4_t, vdotq_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24); TEST_LANE (int8x16_t, int8x8_t, int32x4_t, vdotq_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24); + TEST_LANEQ (uint8x8_t, uint8x16_t, uint32x2_t, vdot_laneq_u32, P2(1,2), P2(2,3), 8, 16, 12, 24); + TEST_LANEQ (int8x8_t, int8x16_t, int32x2_t, vdot_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24); + + TEST_LANEQ (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_laneq_u32, P2(1,2), P2(2,3), 8, 16, 12, 24); + TEST_LANEQ (int8x16_t, int8x16_t, int32x4_t, vdotq_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24); + return 0; }
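As a closing note, a small sketch (illustrative only, not part of the patch) of the lane handling in the new neon_<sup>dot_laneq pattern above: the q-register operand is addressed as two d halves, so laneq indices 0-1 select the low half and 2-3 select the high half with the index rebased, assuming the usual meaning of the %e (low d half) and %f (high d half) output modifiers:

    /* Mirror of the index computation in neon_<sup>dot_laneq<vsi2qi>;
       GET_MODE_NUNITS (V2SImode) is 2.  */
    struct laneq_split { int high_half; int lane; };

    static struct laneq_split
    split_laneq_index (int lane)
    {
      struct laneq_split s;
      s.high_half = lane >= 2;              /* lane > GET_MODE_NUNITS (V2SImode) - 1.  */
      s.lane = s.high_half ? lane - 2 : lane;
      return s;
    }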