diff mbox series

[rs6000] Fix PR81833 (incorrect code gen for vec_msum)

Message ID 3b753f75-ada4-6f84-a497-8fce45187ae9@linux.vnet.ibm.com
State New
Headers show
Series [rs6000] Fix PR81833 (incorrect code gen for vec_msum) | expand

Commit Message

Bill Schmidt Aug. 28, 2017, 8:56 p.m. UTC
Hi, 

PR81833 identifies a problem with the little-endian vector multiply-sum
instructions.  The original implementation is quite poor (and I am allowed
to say that, since it was mine).  This patch fixes the code properly.

The revised code still uses UNSPECs for these ops, which is not strictly
necessary, although descriptive rtl for them would be pretty complex.  I've
put in a FIXME to make note of that for a future cleanup.

Bootstrapped and tested on powerpc64le-linux-gnu with no regressions.  I am
currently testing on powerpc64-linux-gnu for 32- and 64-bit.  Provided that
testing succeeds, is this ok for trunk, and for eventual backport to all
supported releases?

Thanks,
Bill


[gcc]

2017-08-28  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	PR target/81833
	* config/rs6000/altivec.md (altivec_vsum2sws): Convert from a
	define_insn to a define_expand.
	(altivec_vsum2sws_direct): New define_insn.
	(altivec_vsumsws): Convert from a define_insn to a define_expand.

[gcc/testsuite]

2017-08-28  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	PR target/81833
	* gcc.target/powerpc/pr81833.c: New file.

Comments

Bill Schmidt Aug. 29, 2017, 12:38 a.m. UTC | #1
On Aug 28, 2017, at 3:56 PM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote:
> 
> Hi, 
> 
> PR81833 identifies a problem with the little-endian vector multiply-sum
> instructions.  The original implementation is quite poor (and I am allowed
> to say that, since it was mine).  This patch fixes the code properly.
> 
> The revised code still uses UNSPECs for these ops, which is not strictly
> necessary, although descriptive rtl for them would be pretty complex.  I've
> put in a FIXME to make note of that for a future cleanup.
> 
> Bootstrapped and tested on powerpc64le-linux-gnu with no regressions.  I am
> currently testing on powerpc64-linux-gnu for 32- and 64-bit.  Provided that
> testing succeeds, is this ok for trunk, and for eventual backport to all
> supported releases?

FYI, big-endian tests have completed successfully.

Bill
> 
> Thanks,
> Bill
> 
> 
> [gcc]
> 
> 2017-08-28  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	PR target/81833
> 	* config/rs6000/altivec.md (altivec_vsum2sws): Convert from a
> 	define_insn to a define_expand.
> 	(altivec_vsum2sws_direct): New define_insn.
> 	(altivec_vsumsws): Convert from a define_insn to a define_expand.
> 
> [gcc/testsuite]
> 
> 2017-08-28  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	PR target/81833
> 	* gcc.target/powerpc/pr81833.c: New file.
> 
> 
> Index: gcc/config/rs6000/altivec.md
> ===================================================================
> --- gcc/config/rs6000/altivec.md	(revision 251369)
> +++ gcc/config/rs6000/altivec.md	(working copy)
> @@ -1804,51 +1804,61 @@
>   "vsum4s<VI_char>s %0,%1,%2"
>   [(set_attr "type" "veccomplex")])
> 
> -;; FIXME: For the following two patterns, the scratch should only be
> -;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should
> -;; be emitted separately.
> -(define_insn "altivec_vsum2sws"
> -  [(set (match_operand:V4SI 0 "register_operand" "=v")
> -        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
> -                      (match_operand:V4SI 2 "register_operand" "v")]
> -		     UNSPEC_VSUM2SWS))
> -   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
> -   (clobber (match_scratch:V4SI 3 "=v"))]
> +(define_expand "altivec_vsum2sws"
> +  [(use (match_operand:V4SI 0 "register_operand"))
> +   (use (match_operand:V4SI 1 "register_operand"))
> +   (use (match_operand:V4SI 2 "register_operand"))]
>   "TARGET_ALTIVEC"
> {
>   if (VECTOR_ELT_ORDER_BIG)
> -    return "vsum2sws %0,%1,%2";
> +    emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1],
> +                                            operands[2]));
>   else
> -    return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4";
> -}
> -  [(set_attr "type" "veccomplex")
> -   (set (attr "length")
> -     (if_then_else
> -       (match_test "VECTOR_ELT_ORDER_BIG")
> -       (const_string "4")
> -       (const_string "12")))])
> +    {
> +      rtx tmp1 = gen_reg_rtx (V4SImode);
> +      rtx tmp2 = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2],
> +                                          operands[2], GEN_INT (12)));
> +      emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1));
> +      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
> +                                          GEN_INT (4)));
> +    }
> +  DONE;
> +})
> 
> -(define_insn "altivec_vsumsws"
> +; FIXME: This can probably be expressed without an UNSPEC.
> +(define_insn "altivec_vsum2sws_direct"
>   [(set (match_operand:V4SI 0 "register_operand" "=v")
>         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
> -                      (match_operand:V4SI 2 "register_operand" "v")]
> -		     UNSPEC_VSUMSWS))
> -   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
> -   (clobber (match_scratch:V4SI 3 "=v"))]
> +	              (match_operand:V4SI 2 "register_operand" "v")]
> +		     UNSPEC_VSUM2SWS))]
>   "TARGET_ALTIVEC"
> +  "vsum2sws %0,%1,%2"
> +  [(set_attr "type" "veccomplex")
> +   (set_attr "length" "4")])
> +
> +(define_expand "altivec_vsumsws"
> +  [(use (match_operand:V4SI 0 "register_operand"))
> +   (use (match_operand:V4SI 1 "register_operand"))
> +   (use (match_operand:V4SI 2 "register_operand"))]
> +  "TARGET_ALTIVEC"
> {
>   if (VECTOR_ELT_ORDER_BIG)
> -    return "vsumsws %0,%1,%2";
> +    emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1],
> +                                           operands[2]));
>   else
> -    return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12";
> -}
> -  [(set_attr "type" "veccomplex")
> -   (set (attr "length")
> -     (if_then_else
> -       (match_test "(VECTOR_ELT_ORDER_BIG)")
> -       (const_string "4")
> -       (const_string "12")))])
> +    {
> +      rtx tmp1 = gen_reg_rtx (V4SImode);
> +      rtx tmp2 = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx));
> +      emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1));
> +      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
> +                                          GEN_INT (12)));
> +    }
> +  DONE;
> +})
> 
> +; FIXME: This can probably be expressed without an UNSPEC.
> (define_insn "altivec_vsumsws_direct"
>   [(set (match_operand:V4SI 0 "register_operand" "=v")
>         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
> Index: gcc/testsuite/gcc.target/powerpc/pr81833.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/pr81833.c	(nonexistent)
> +++ gcc/testsuite/gcc.target/powerpc/pr81833.c	(working copy)
> @@ -0,0 +1,54 @@
> +/* PR81833: This used to fail due to improper implementation of vec_msum.  */
> +
> +/* { dg-do run {target { lp64 } } } */
> +/* { dg-require-effective-target powerpc_altivec_ok } */
> +
> +#include <altivec.h>
> +
> +#define vec_u8  vector unsigned char
> +#define vec_s8  vector signed char
> +#define vec_u16 vector unsigned short
> +#define vec_s16 vector signed short
> +#define vec_u32 vector unsigned int
> +#define vec_s32 vector signed int
> +#define vec_f   vector float
> +
> +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0)
> +
> +#define zero_u8v  (vec_u8)  zerov
> +#define zero_s8v  (vec_s8)  zerov
> +#define zero_u16v (vec_u16) zerov
> +#define zero_s16v (vec_s16) zerov
> +#define zero_u32v (vec_u32) zerov
> +#define zero_s32v (vec_s32) zerov
> +
> +signed int __attribute__((noinline))
> +scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
> +			 int order)
> +{
> +  int i;
> +  LOAD_ZERO;
> +  register vec_s16 vec1;
> +  register vec_s32 res = vec_splat_s32 (0), t;
> +  signed int ires;
> +
> +  for (i = 0; i < order; i += 8) {
> +    vec1 = vec_vsx_ld (0, v1);
> +    t    = vec_msum (vec1, vec_ld (0, v2), zero_s32v);
> +    res  = vec_sums (t, res);
> +    v1  += 8;
> +    v2  += 8;
> +  }
> +  res = vec_splat (res, 3);
> +  vec_ste (res, 0, &ires);
> +
> +  return ires;
> +}
> +
> +int main(void)
> +{
> +  const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 };
> +  if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8)
> +    __builtin_abort ();
> +  return 0;
> +}
>
Segher Boessenkool Aug. 29, 2017, 12:02 p.m. UTC | #2
Hi Bill,

On Mon, Aug 28, 2017 at 03:56:21PM -0500, Bill Schmidt wrote:
> PR81833 identifies a problem with the little-endian vector multiply-sum
> instructions.  The original implementation is quite poor (and I am allowed
> to say that, since it was mine).  This patch fixes the code properly.
> 
> The revised code still uses UNSPECs for these ops, which is not strictly
> necessary, although descriptive rtl for them would be pretty complex.  I've
> put in a FIXME to make note of that for a future cleanup.

There is ss_plus, but that is the saturated sum of two things, not of
five (resp. three) operands as we need for vsumsws and vsum2sws.

If you convert to double-width, then add, then clamp, the result will
be correct; but then, very often some of that will be folded away (say,
when combine works on these patterns), and then you need a lot of
different patterns to catch all of this.

So we really need an ss_plus that has more than two arguments.  We could
have an unspec for that as well (instead of the unspecs that work on
vectors, unspecs that work on the elements), that may work.

Or perhaps it will work best as-is: the only way we get these unspecs
is via intrinsics, maybe we should just trust the user.

> 	PR target/81833
> 	* config/rs6000/altivec.md (altivec_vsum2sws): Convert from a
> 	define_insn to a define_expand.
> 	(altivec_vsum2sws_direct): New define_insn.
> 	(altivec_vsumsws): Convert from a define_insn to a define_expand.
> 
> [gcc/testsuite]
> 
> 2017-08-28  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	PR target/81833
> 	* gcc.target/powerpc/pr81833.c: New file.


> +; FIXME: This can probably be expressed without an UNSPEC.
> +(define_insn "altivec_vsum2sws_direct"
>    [(set (match_operand:V4SI 0 "register_operand" "=v")
>          (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
> +	              (match_operand:V4SI 2 "register_operand" "v")]
> +		     UNSPEC_VSUM2SWS))]
>    "TARGET_ALTIVEC"
> +  "vsum2sws %0,%1,%2"
> +  [(set_attr "type" "veccomplex")
> +   (set_attr "length" "4")])

This no longer writes to VSCR; please fix that.  "length" 4 is the
default, so just leave it off.

vsumsws has both of these correct already.

> --- gcc/testsuite/gcc.target/powerpc/pr81833.c	(nonexistent)
> +++ gcc/testsuite/gcc.target/powerpc/pr81833.c	(working copy)
> @@ -0,0 +1,54 @@
> +/* PR81833: This used to fail due to improper implementation of vec_msum.  */
> +
> +/* { dg-do run {target { lp64 } } } */

Does this need lp64?  I don't at first glance see anything that would
need it.  Oh, vec_vsx_ld perhaps?

Okay for trunk and branches with the above fixes.  Thanks!


Segher
diff mbox series

Patch

Index: gcc/config/rs6000/altivec.md
===================================================================
--- gcc/config/rs6000/altivec.md	(revision 251369)
+++ gcc/config/rs6000/altivec.md	(working copy)
@@ -1804,51 +1804,61 @@ 
   "vsum4s<VI_char>s %0,%1,%2"
   [(set_attr "type" "veccomplex")])
 
-;; FIXME: For the following two patterns, the scratch should only be
-;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should
-;; be emitted separately.
-(define_insn "altivec_vsum2sws"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
-                      (match_operand:V4SI 2 "register_operand" "v")]
-		     UNSPEC_VSUM2SWS))
-   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
-   (clobber (match_scratch:V4SI 3 "=v"))]
+(define_expand "altivec_vsum2sws"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (use (match_operand:V4SI 1 "register_operand"))
+   (use (match_operand:V4SI 2 "register_operand"))]
   "TARGET_ALTIVEC"
 {
   if (VECTOR_ELT_ORDER_BIG)
-    return "vsum2sws %0,%1,%2";
+    emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1],
+                                            operands[2]));
   else
-    return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4";
-}
-  [(set_attr "type" "veccomplex")
-   (set (attr "length")
-     (if_then_else
-       (match_test "VECTOR_ELT_ORDER_BIG")
-       (const_string "4")
-       (const_string "12")))])
+    {
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2],
+                                          operands[2], GEN_INT (12)));
+      emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1));
+      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
+                                          GEN_INT (4)));
+    }
+  DONE;
+})
 
-(define_insn "altivec_vsumsws"
+; FIXME: This can probably be expressed without an UNSPEC.
+(define_insn "altivec_vsum2sws_direct"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
-                      (match_operand:V4SI 2 "register_operand" "v")]
-		     UNSPEC_VSUMSWS))
-   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
-   (clobber (match_scratch:V4SI 3 "=v"))]
+	              (match_operand:V4SI 2 "register_operand" "v")]
+		     UNSPEC_VSUM2SWS))]
   "TARGET_ALTIVEC"
+  "vsum2sws %0,%1,%2"
+  [(set_attr "type" "veccomplex")
+   (set_attr "length" "4")])
+
+(define_expand "altivec_vsumsws"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (use (match_operand:V4SI 1 "register_operand"))
+   (use (match_operand:V4SI 2 "register_operand"))]
+  "TARGET_ALTIVEC"
 {
   if (VECTOR_ELT_ORDER_BIG)
-    return "vsumsws %0,%1,%2";
+    emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1],
+                                           operands[2]));
   else
-    return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12";
-}
-  [(set_attr "type" "veccomplex")
-   (set (attr "length")
-     (if_then_else
-       (match_test "(VECTOR_ELT_ORDER_BIG)")
-       (const_string "4")
-       (const_string "12")))])
+    {
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx));
+      emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1));
+      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
+                                          GEN_INT (12)));
+    }
+  DONE;
+})
 
+; FIXME: This can probably be expressed without an UNSPEC.
 (define_insn "altivec_vsumsws_direct"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
Index: gcc/testsuite/gcc.target/powerpc/pr81833.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr81833.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/pr81833.c	(working copy)
@@ -0,0 +1,54 @@ 
+/* PR81833: This used to fail due to improper implementation of vec_msum.  */
+
+/* { dg-do run {target { lp64 } } } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+
+#include <altivec.h>
+
+#define vec_u8  vector unsigned char
+#define vec_s8  vector signed char
+#define vec_u16 vector unsigned short
+#define vec_s16 vector signed short
+#define vec_u32 vector unsigned int
+#define vec_s32 vector signed int
+#define vec_f   vector float
+
+#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0)
+
+#define zero_u8v  (vec_u8)  zerov
+#define zero_s8v  (vec_s8)  zerov
+#define zero_u16v (vec_u16) zerov
+#define zero_s16v (vec_s16) zerov
+#define zero_u32v (vec_u32) zerov
+#define zero_s32v (vec_s32) zerov
+
+signed int __attribute__((noinline))
+scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
+			 int order)
+{
+  int i;
+  LOAD_ZERO;
+  register vec_s16 vec1;
+  register vec_s32 res = vec_splat_s32 (0), t;
+  signed int ires;
+
+  for (i = 0; i < order; i += 8) {
+    vec1 = vec_vsx_ld (0, v1);
+    t    = vec_msum (vec1, vec_ld (0, v2), zero_s32v);
+    res  = vec_sums (t, res);
+    v1  += 8;
+    v2  += 8;
+  }
+  res = vec_splat (res, 3);
+  vec_ste (res, 0, &ires);
+
+  return ires;
+}
+
+int main(void)
+{
+  const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 };
+  if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8)
+    __builtin_abort ();
+  return 0;
+}