Message ID | 3b753f75-ada4-6f84-a497-8fce45187ae9@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
Series | [rs6000] Fix PR81833 (incorrect code gen for vec_msum) | expand |
On Aug 28, 2017, at 3:56 PM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote: > > Hi, > > PR81833 identifies a problem with the little-endian vector multiply-sum > instructions. The original implementation is quite poor (and I am allowed > to say that, since it was mine). This patch fixes the code properly. > > The revised code still uses UNSPECs for these ops, which is not strictly > necessary, although descriptive rtl for them would be pretty complex. I've > put in a FIXME to make note of that for a future cleanup. > > Bootstrapped and tested on powerpc64le-linux-gnu with no regressions. I am > currently testing on powerpc64-linux-gnu for 32- and 64-bit. Provided that > testing succeeds, is this ok for trunk, and for eventual backport to all > supported releases? FYI, big-endian tests have completed successfully. Bill > > Thanks, > Bill > > > [gcc] > > 2017-08-28 Bill Schmidt <wschmidt@linux.vnet.ibm.com> > > PR target/81833 > * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a > define_insn to a define_expand. > (altivec_vsum2sws_direct): New define_insn. > (altivec_vsumsws): Convert from a define_insn to a define_expand. > > [gcc/testsuite] > > 2017-08-28 Bill Schmidt <wschmidt@linux.vnet.ibm.com> > > PR target/81833 > * gcc.target/powerpc/pr81833.c: New file. > > > Index: gcc/config/rs6000/altivec.md > =================================================================== > --- gcc/config/rs6000/altivec.md (revision 251369) > +++ gcc/config/rs6000/altivec.md (working copy) > @@ -1804,51 +1804,61 @@ > "vsum4s<VI_char>s %0,%1,%2" > [(set_attr "type" "veccomplex")]) > > -;; FIXME: For the following two patterns, the scratch should only be > -;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should > -;; be emitted separately. 
> -(define_insn "altivec_vsum2sws" > - [(set (match_operand:V4SI 0 "register_operand" "=v") > - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") > - (match_operand:V4SI 2 "register_operand" "v")] > - UNSPEC_VSUM2SWS)) > - (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR)) > - (clobber (match_scratch:V4SI 3 "=v"))] > +(define_expand "altivec_vsum2sws" > + [(use (match_operand:V4SI 0 "register_operand")) > + (use (match_operand:V4SI 1 "register_operand")) > + (use (match_operand:V4SI 2 "register_operand"))] > "TARGET_ALTIVEC" > { > if (VECTOR_ELT_ORDER_BIG) > - return "vsum2sws %0,%1,%2"; > + emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1], > + operands[2])); > else > - return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4"; > -} > - [(set_attr "type" "veccomplex") > - (set (attr "length") > - (if_then_else > - (match_test "VECTOR_ELT_ORDER_BIG") > - (const_string "4") > - (const_string "12")))]) > + { > + rtx tmp1 = gen_reg_rtx (V4SImode); > + rtx tmp2 = gen_reg_rtx (V4SImode); > + emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2], > + operands[2], GEN_INT (12))); > + emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1)); > + emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2, > + GEN_INT (4))); > + } > + DONE; > +}) > > -(define_insn "altivec_vsumsws" > +; FIXME: This can probably be expressed without an UNSPEC. 
> +(define_insn "altivec_vsum2sws_direct" > [(set (match_operand:V4SI 0 "register_operand" "=v") > (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") > - (match_operand:V4SI 2 "register_operand" "v")] > - UNSPEC_VSUMSWS)) > - (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR)) > - (clobber (match_scratch:V4SI 3 "=v"))] > + (match_operand:V4SI 2 "register_operand" "v")] > + UNSPEC_VSUM2SWS))] > "TARGET_ALTIVEC" > + "vsum2sws %0,%1,%2" > + [(set_attr "type" "veccomplex") > + (set_attr "length" "4")]) > + > +(define_expand "altivec_vsumsws" > + [(use (match_operand:V4SI 0 "register_operand")) > + (use (match_operand:V4SI 1 "register_operand")) > + (use (match_operand:V4SI 2 "register_operand"))] > + "TARGET_ALTIVEC" > { > if (VECTOR_ELT_ORDER_BIG) > - return "vsumsws %0,%1,%2"; > + emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1], > + operands[2])); > else > - return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12"; > -} > - [(set_attr "type" "veccomplex") > - (set (attr "length") > - (if_then_else > - (match_test "(VECTOR_ELT_ORDER_BIG)") > - (const_string "4") > - (const_string "12")))]) > + { > + rtx tmp1 = gen_reg_rtx (V4SImode); > + rtx tmp2 = gen_reg_rtx (V4SImode); > + emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx)); > + emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1)); > + emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2, > + GEN_INT (12))); > + } > + DONE; > +}) > > +; FIXME: This can probably be expressed without an UNSPEC. 
> (define_insn "altivec_vsumsws_direct" > [(set (match_operand:V4SI 0 "register_operand" "=v") > (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") > Index: gcc/testsuite/gcc.target/powerpc/pr81833.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/pr81833.c (nonexistent) > +++ gcc/testsuite/gcc.target/powerpc/pr81833.c (working copy) > @@ -0,0 +1,54 @@ > +/* PR81833: This used to fail due to improper implementation of vec_msum. */ > + > +/* { dg-do run {target { lp64 } } } */ > +/* { dg-require-effective-target powerpc_altivec_ok } */ > + > +#include <altivec.h> > + > +#define vec_u8 vector unsigned char > +#define vec_s8 vector signed char > +#define vec_u16 vector unsigned short > +#define vec_s16 vector signed short > +#define vec_u32 vector unsigned int > +#define vec_s32 vector signed int > +#define vec_f vector float > + > +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0) > + > +#define zero_u8v (vec_u8) zerov > +#define zero_s8v (vec_s8) zerov > +#define zero_u16v (vec_u16) zerov > +#define zero_s16v (vec_s16) zerov > +#define zero_u32v (vec_u32) zerov > +#define zero_s32v (vec_s32) zerov > + > +signed int __attribute__((noinline)) > +scalarproduct_int16_vsx (const signed short *v1, const signed short *v2, > + int order) > +{ > + int i; > + LOAD_ZERO; > + register vec_s16 vec1; > + register vec_s32 res = vec_splat_s32 (0), t; > + signed int ires; > + > + for (i = 0; i < order; i += 8) { > + vec1 = vec_vsx_ld (0, v1); > + t = vec_msum (vec1, vec_ld (0, v2), zero_s32v); > + res = vec_sums (t, res); > + v1 += 8; > + v2 += 8; > + } > + res = vec_splat (res, 3); > + vec_ste (res, 0, &ires); > + > + return ires; > +} > + > +int main(void) > +{ > + const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 }; > + if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8) > + __builtin_abort (); > + return 0; > +} >
Hi Bill, On Mon, Aug 28, 2017 at 03:56:21PM -0500, Bill Schmidt wrote: > PR81833 identifies a problem with the little-endian vector multiply-sum > instructions. The original implementation is quite poor (and I am allowed > to say that, since it was mine). This patch fixes the code properly. > > The revised code still uses UNSPECs for these ops, which is not strictly > necessary, although descriptive rtl for them would be pretty complex. I've > put in a FIXME to make note of that for a future cleanup. There is ss_plus, but that is the saturated sum of two things, not of five or three, respectively, as we need for vsumsws and vsum2sws. If you convert to double-width, then add, then clamp, the result will be correct; but then, very often some of that will be folded away (say, when combine works on these patterns), and then you need a lot of different patterns to catch all of this. So we really need an ss_plus that has more than two arguments. We could have an unspec for that as well (instead of the unspecs that work on vectors, unspecs that work on the elements); that may work. Or perhaps it will work best as-is: the only way we get these unspecs is via intrinsics, so maybe we should just trust the user. > PR target/81833 > * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a > define_insn to a define_expand. > (altivec_vsum2sws_direct): New define_insn. > (altivec_vsumsws): Convert from a define_insn to a define_expand. > > [gcc/testsuite] > > 2017-08-28 Bill Schmidt <wschmidt@linux.vnet.ibm.com> > > PR target/81833 > * gcc.target/powerpc/pr81833.c: New file. > +; FIXME: This can probably be expressed without an UNSPEC. 
> +(define_insn "altivec_vsum2sws_direct" > [(set (match_operand:V4SI 0 "register_operand" "=v") > (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") > + (match_operand:V4SI 2 "register_operand" "v")] > + UNSPEC_VSUM2SWS))] > "TARGET_ALTIVEC" > + "vsum2sws %0,%1,%2" > + [(set_attr "type" "veccomplex") > + (set_attr "length" "4")]) This no longer writes to VSCR; please fix that. "length" 4 is the default, so just leave it out. vsumsws has both of these correct already. > --- gcc/testsuite/gcc.target/powerpc/pr81833.c (nonexistent) > +++ gcc/testsuite/gcc.target/powerpc/pr81833.c (working copy) > @@ -0,0 +1,54 @@ > +/* PR81833: This used to fail due to improper implementation of vec_msum. */ > + > +/* { dg-do run {target { lp64 } } } */ Does this need lp64? I don't at first glance see anything that would need it. Oh, vec_vsx_ld perhaps? Okay for trunk and branches with the above fixes. Thanks! Segher
Index: gcc/config/rs6000/altivec.md =================================================================== --- gcc/config/rs6000/altivec.md (revision 251369) +++ gcc/config/rs6000/altivec.md (working copy) @@ -1804,51 +1804,61 @@ "vsum4s<VI_char>s %0,%1,%2" [(set_attr "type" "veccomplex")]) -;; FIXME: For the following two patterns, the scratch should only be -;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should -;; be emitted separately. -(define_insn "altivec_vsum2sws" - [(set (match_operand:V4SI 0 "register_operand" "=v") - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") - (match_operand:V4SI 2 "register_operand" "v")] - UNSPEC_VSUM2SWS)) - (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR)) - (clobber (match_scratch:V4SI 3 "=v"))] +(define_expand "altivec_vsum2sws" + [(use (match_operand:V4SI 0 "register_operand")) + (use (match_operand:V4SI 1 "register_operand")) + (use (match_operand:V4SI 2 "register_operand"))] "TARGET_ALTIVEC" { if (VECTOR_ELT_ORDER_BIG) - return "vsum2sws %0,%1,%2"; + emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1], + operands[2])); else - return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4"; -} - [(set_attr "type" "veccomplex") - (set (attr "length") - (if_then_else - (match_test "VECTOR_ELT_ORDER_BIG") - (const_string "4") - (const_string "12")))]) + { + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_reg_rtx (V4SImode); + emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2], + operands[2], GEN_INT (12))); + emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1)); + emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2, + GEN_INT (4))); + } + DONE; +}) -(define_insn "altivec_vsumsws" +; FIXME: This can probably be expressed without an UNSPEC. 
+(define_insn "altivec_vsum2sws_direct" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") - (match_operand:V4SI 2 "register_operand" "v")] - UNSPEC_VSUMSWS)) - (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR)) - (clobber (match_scratch:V4SI 3 "=v"))] + (match_operand:V4SI 2 "register_operand" "v")] + UNSPEC_VSUM2SWS))] "TARGET_ALTIVEC" + "vsum2sws %0,%1,%2" + [(set_attr "type" "veccomplex") + (set_attr "length" "4")]) + +(define_expand "altivec_vsumsws" + [(use (match_operand:V4SI 0 "register_operand")) + (use (match_operand:V4SI 1 "register_operand")) + (use (match_operand:V4SI 2 "register_operand"))] + "TARGET_ALTIVEC" { if (VECTOR_ELT_ORDER_BIG) - return "vsumsws %0,%1,%2"; + emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1], + operands[2])); else - return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12"; -} - [(set_attr "type" "veccomplex") - (set (attr "length") - (if_then_else - (match_test "(VECTOR_ELT_ORDER_BIG)") - (const_string "4") - (const_string "12")))]) + { + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_reg_rtx (V4SImode); + emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx)); + emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1)); + emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2, + GEN_INT (12))); + } + DONE; +}) +; FIXME: This can probably be expressed without an UNSPEC. (define_insn "altivec_vsumsws_direct" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") Index: gcc/testsuite/gcc.target/powerpc/pr81833.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/pr81833.c (nonexistent) +++ gcc/testsuite/gcc.target/powerpc/pr81833.c (working copy) @@ -0,0 +1,54 @@ +/* PR81833: This used to fail due to improper implementation of vec_msum. 
*/ + +/* { dg-do run {target { lp64 } } } */ +/* { dg-require-effective-target powerpc_altivec_ok } */ + +#include <altivec.h> + +#define vec_u8 vector unsigned char +#define vec_s8 vector signed char +#define vec_u16 vector unsigned short +#define vec_s16 vector signed short +#define vec_u32 vector unsigned int +#define vec_s32 vector signed int +#define vec_f vector float + +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0) + +#define zero_u8v (vec_u8) zerov +#define zero_s8v (vec_s8) zerov +#define zero_u16v (vec_u16) zerov +#define zero_s16v (vec_s16) zerov +#define zero_u32v (vec_u32) zerov +#define zero_s32v (vec_s32) zerov + +signed int __attribute__((noinline)) +scalarproduct_int16_vsx (const signed short *v1, const signed short *v2, + int order) +{ + int i; + LOAD_ZERO; + register vec_s16 vec1; + register vec_s32 res = vec_splat_s32 (0), t; + signed int ires; + + for (i = 0; i < order; i += 8) { + vec1 = vec_vsx_ld (0, v1); + t = vec_msum (vec1, vec_ld (0, v2), zero_s32v); + res = vec_sums (t, res); + v1 += 8; + v2 += 8; + } + res = vec_splat (res, 3); + vec_ste (res, 0, &ires); + + return ires; +} + +int main(void) +{ + const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 }; + if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8) + __builtin_abort (); + return 0; +}