Message ID | patch-14499-tamar@arm.com |
---|---|
State | New |
Headers | show |
Series | AArch64: Correct dot-product auto-vect optab RTL | expand |
Tamar Christina <tamar.christina@arm.com> writes: > Hi All, > > The current RTL for the vectorizer patterns for dot-product are incorrect. > Operand3 isn't an output parameter so we can't write to it. > > This fixes this issue and reduces the number of RTL. > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > Ok for master? And backport to GCC 11, 10, 9? Yeah, OK for both master and backports, thanks. Richard > Thanks, > Tamar > > gcc/ChangeLog: > > * config/aarch64/aarch64-simd-builtins.def (udot, sdot): Rename to... > (sdot_prod, udot_prod): ...These. > * config/aarch64/aarch64-simd.md (<sur>dot_prod<vsi2qi>): Remove. > (</vsi2qi>(aarch64_<sur>dot<vsi2qi>): Rename to... > (<sur>dot_prod<vsi2qi>): ...This. > * config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32): > Update builtins. > > --- inline copy of patch -- > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def > index c869ed9a6ab7d63f0e3d5fe393a93c1cc9142e78..fa3bb7b96710122957933b5c0b0b276256892a4c 100644 > --- a/gcc/config/aarch64/aarch64-simd-builtins.def > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def > @@ -362,8 +362,8 @@ > BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) > > /* Implemented by <sur><dotprod>_prod<dot_mode>. */ > - BUILTIN_VB (TERNOP, sdot, 0, NONE) > - BUILTIN_VB (TERNOPU, udot, 0, NONE) > + BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) > + BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) > BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) > /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */ > BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > index 253ddbe25d3a86af4b40b056132e6a86a0392ea6..638e2d103bcba0af2292b16efd02046d1195095b 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -587,8 +587,28 @@ (define_expand "cmul<conj_op><mode>3" > DONE; > }) > > -;; These instructions map to the __builtins for the Dot Product operations. > -(define_insn "aarch64_<sur>dot<vsi2qi>" > +;; These expands map to the Dot Product optab the vectorizer checks for > +;; and to the intrinsics patttern. > +;; The auto-vectorizer expects a dot product builtin that also does an > +;; accumulation into the provided register. > +;; Given the following pattern > +;; > +;; for (i=0; i<len; i++) { > +;; c = a[i] * b[i]; > +;; r += c; > +;; } > +;; return result; > +;; > +;; This can be auto-vectorized to > +;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; > +;; > +;; given enough iterations. However the vectorizer can keep unrolling the loop > +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; > +;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; > +;; ... > +;; > +;; and so the vectorizer provides r, in which the result has to be accumulated. > +(define_insn "<sur>dot_prod<vsi2qi>" > [(set (match_operand:VS 0 "register_operand" "=w") > (plus:VS (match_operand:VS 1 "register_operand" "0") > (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") > @@ -613,41 +633,6 @@ (define_insn "usdot_prod<vsi2qi>" > [(set_attr "type" "neon_dot<q>")] > ) > > -;; These expands map to the Dot Product optab the vectorizer checks for. > -;; The auto-vectorizer expects a dot product builtin that also does an > -;; accumulation into the provided register. > -;; Given the following pattern > -;; > -;; for (i=0; i<len; i++) { > -;; c = a[i] * b[i]; > -;; r += c; > -;; } > -;; return result; > -;; > -;; This can be auto-vectorized to > -;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; > -;; > -;; given enough iterations. However the vectorizer can keep unrolling the loop > -;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; > -;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; > -;; ... > -;; > -;; and so the vectorizer provides r, in which the result has to be accumulated. > -(define_expand "<sur>dot_prod<vsi2qi>" > - [(set (match_operand:VS 0 "register_operand") > - (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand") > - (match_operand:<VSI2QI> 2 "register_operand")] > - DOTPROD) > - (match_operand:VS 3 "register_operand")))] > - "TARGET_DOTPROD" > -{ > - emit_insn ( > - gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1], > - operands[2])); > - emit_insn (gen_rtx_SET (operands[0], operands[3])); > - DONE; > -}) > - > ;; These instructions map to the __builtins for the Dot Product > ;; indexed operations. > (define_insn "aarch64_<sur>dot_lane<vsi2qi>" > @@ -944,8 +929,7 @@ (define_expand "<sur>sadv16qi" > rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); > rtx abd = gen_reg_rtx (V16QImode); > emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2])); > - emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], > - abd, ones)); > + emit_insn (gen_udot_prodv16qi (operands[0], operands[3], abd, ones)); > DONE; > } > rtx reduc = gen_reg_rtx (V8HImode); > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index 373f06a24ea6ce686d7e0cdf53dd364041c61092..90770411f177f05b4f1bdbd83890734612c31dc3 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -32112,28 +32112,28 @@ __extension__ extern __inline uint32x2_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) > { > - return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); > + return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b); > } > > __extension__ extern __inline uint32x4_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) > { > - return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); > + return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b); > } > > __extension__ extern __inline int32x2_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) > { > - return __builtin_aarch64_sdotv8qi (__r, __a, __b); > + return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b); > } > > __extension__ extern __inline int32x4_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) > { > - return __builtin_aarch64_sdotv16qi (__r, __a, __b); > + return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b); > } > > __extension__ extern __inline uint32x2_t
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index c869ed9a6ab7d63f0e3d5fe393a93c1cc9142e78..fa3bb7b96710122957933b5c0b0b276256892a4c 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -362,8 +362,8 @@ BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) /* Implemented by <sur><dotprod>_prod<dot_mode>. */ - BUILTIN_VB (TERNOP, sdot, 0, NONE) - BUILTIN_VB (TERNOPU, udot, 0, NONE) + BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) + BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE) /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 253ddbe25d3a86af4b40b056132e6a86a0392ea6..638e2d103bcba0af2292b16efd02046d1195095b 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -587,8 +587,28 @@ (define_expand "cmul<conj_op><mode>3" DONE; }) -;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "aarch64_<sur>dot<vsi2qi>" +;; These expands map to the Dot Product optab the vectorizer checks for +;; and to the intrinsics patttern. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; i<len; i++) { +;; c = a[i] * b[i]; +;; r += c; +;; } +;; return result; +;; +;; This can be auto-vectorized to +;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; +;; +;; given enough iterations. However the vectorizer can keep unrolling the loop +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; +;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; +;; ... +;; +;; and so the vectorizer provides r, in which the result has to be accumulated. +(define_insn "<sur>dot_prod<vsi2qi>" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (match_operand:VS 1 "register_operand" "0") (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") @@ -613,41 +633,6 @@ (define_insn "usdot_prod<vsi2qi>" [(set_attr "type" "neon_dot<q>")] ) -;; These expands map to the Dot Product optab the vectorizer checks for. -;; The auto-vectorizer expects a dot product builtin that also does an -;; accumulation into the provided register. -;; Given the following pattern -;; -;; for (i=0; i<len; i++) { -;; c = a[i] * b[i]; -;; r += c; -;; } -;; return result; -;; -;; This can be auto-vectorized to -;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; -;; -;; given enough iterations. However the vectorizer can keep unrolling the loop -;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; -;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; -;; ... -;; -;; and so the vectorizer provides r, in which the result has to be accumulated. -(define_expand "<sur>dot_prod<vsi2qi>" - [(set (match_operand:VS 0 "register_operand") - (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand") - (match_operand:<VSI2QI> 2 "register_operand")] - DOTPROD) - (match_operand:VS 3 "register_operand")))] - "TARGET_DOTPROD" -{ - emit_insn ( - gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) - ;; These instructions map to the __builtins for the Dot Product ;; indexed operations. (define_insn "aarch64_<sur>dot_lane<vsi2qi>" @@ -944,8 +929,7 @@ (define_expand "<sur>sadv16qi" rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2])); - emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], - abd, ones)); + emit_insn (gen_udot_prodv16qi (operands[0], operands[3], abd, ones)); DONE; } rtx reduc = gen_reg_rtx (V8HImode); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 373f06a24ea6ce686d7e0cdf53dd364041c61092..90770411f177f05b4f1bdbd83890734612c31dc3 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -32112,28 +32112,28 @@ __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); + return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sdotv8qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sdotv16qi (__r, __a, __b); + return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b); } __extension__ extern __inline uint32x2_t