diff mbox series

Adjust ix86_rtx_costs for pternlog_operand_p.

Message ID 20240613004440.335650-1-hongtao.liu@intel.com
State New
Headers show
Series Adjust ix86_rtx_costs for pternlog_operand_p. | expand

Commit Message

liuhongt June 13, 2024, 12:44 a.m. UTC
r15-1100-gec985bc97a0157 improves the handling of ternlog instructions;
GCC can now recognize many different variants of pternlog_operand.

The patch adjusts rtx_costs for that, so pass_combine can
reasonably generate more optimal vpternlog instructions.

For example, in avx512f-vpternlogd-3.c, with the patch, two vpternlogd
instructions are combined into one:


<       vpternlogd      $168, %zmm1, %zmm0, %zmm2
<       vpternlogd      $0x55, %zmm2, %zmm2, %zmm2
>       vpternlogd      $87, %zmm1, %zmm0, %zmm2
<       vpand   %xmm0, %xmm1, %xmm0
<       vpternlogd      $0x55, %zmm0, %zmm0, %zmm0
>       vpternlogd      $63, %zmm1, %zmm0, %zmm1
>       vmovdqa %xmm1, %xmm0
<       vpternlogd      $188, %zmm2, %zmm0, %zmm1
<       vpternlogd      $0x55, %zmm1, %zmm1, %zmm1
>       vpternlogd      $37, %zmm0, %zmm2, %zmm1

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

	* config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for
	pternlog_operand under AVX512, also adjust VEC_DUPLICATE
	accordingly since vec_dup:mem can't be that cheap.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx2-pr98461.c: Scan either notl or
	vpternlog.
	* gcc.target/i386/avx512f-pr96891-3.c: Also scan for inverted
	condition.
	* gcc.target/i386/avx512f-vpternlogd-3.c: Adjust vpternlog
	number to 673.
	* gcc.target/i386/avx512f-vpternlogd-4.c: Ditto.
	* gcc.target/i386/avx512f-vpternlogd-5.c: Ditto.
	* gcc.target/i386/sse2-v1ti-vne.c: Add -mno-avx512f.
---
 gcc/config/i386/i386.cc                       | 39 ++++++++++++++++++-
 gcc/testsuite/gcc.target/i386/avx2-pr98461.c  |  2 +-
 .../gcc.target/i386/avx512f-pr96891-3.c       |  2 +-
 .../gcc.target/i386/avx512f-vpternlogd-3.c    |  2 +-
 .../gcc.target/i386/avx512f-vpternlogd-4.c    |  2 +-
 .../gcc.target/i386/avx512f-vpternlogd-5.c    |  2 +-
 gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c |  2 +-
 7 files changed, 44 insertions(+), 7 deletions(-)
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 173db213d14..9fb1ae575dd 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21571,6 +21571,31 @@  ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
     = speed ? ix86_tune_cost : &ix86_size_cost;
   int src_cost;
 
+  /* Handling different vternlog variants.  */
+  if ((GET_MODE_SIZE (mode) == 64
+       ? (TARGET_AVX512F && TARGET_EVEX512)
+       : (TARGET_AVX512VL
+	  || (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)))
+      && GET_MODE_SIZE (mode) >= 16
+      && outer_code_i == SET
+      && ternlog_operand (x, mode))
+    {
+      rtx args[3];
+
+      args[0] = NULL_RTX;
+      args[1] = NULL_RTX;
+      args[2] = NULL_RTX;
+      int idx = ix86_ternlog_idx (x, args);
+      gcc_assert (idx >= 0);
+
+      *total = cost->sse_op;
+      for (int i = 0; i != 3; i++)
+	if (args[i])
+	  *total += rtx_cost (args[i], GET_MODE (args[i]), UNSPEC, i, speed);
+      return true;
+    }
+
+
   switch (code)
     {
     case SET:
@@ -22233,6 +22258,9 @@  ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
       else if (XINT (x, 1) == UNSPEC_VTERNLOG)
 	{
 	  *total = cost->sse_op;
+	  *total += rtx_cost (XVECEXP (x, 0, 0), mode, code, 0, speed);
+	  *total += rtx_cost (XVECEXP (x, 0, 1), mode, code, 1, speed);
+	  *total += rtx_cost (XVECEXP (x, 0, 2), mode, code, 2, speed);
 	  return true;
 	}
       else if (XINT (x, 1) == UNSPEC_PTEST)
@@ -22260,12 +22288,21 @@  ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
 
     case VEC_SELECT:
     case VEC_CONCAT:
-    case VEC_DUPLICATE:
       /* ??? Assume all of these vector manipulation patterns are
 	 recognizable.  In which case they all pretty much have the
 	 same cost.  */
      *total = cost->sse_op;
      return true;
+    case VEC_DUPLICATE:
+      *total = rtx_cost (XEXP (x, 0),
+			 GET_MODE (XEXP (x, 0)),
+			 VEC_DUPLICATE, 0, speed);
+      /* It's broadcast instruction, not embedded broadcasting.  */
+      if (outer_code == SET)
+	*total += cost->sse_op;
+
+     return true;
+
     case VEC_MERGE:
       mask = XEXP (x, 2);
       /* This is masked instruction, assume the same cost,
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr98461.c b/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
index 15f49b864da..225f2ab00e5 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
@@ -2,7 +2,7 @@ 
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx2 -masm=att" } */
 /* { dg-final { scan-assembler-times "\tvpmovmskb\t" 6 } } */
-/* { dg-final { scan-assembler-times "\tnotl\t" 6 } } */
+/* { dg-final { scan-assembler-times "\t(?:notl|vpternlog\[dq\])\t" 6 } } */
 /* { dg-final { scan-assembler-not "\tvpcmpeq" } } */
 /* { dg-final { scan-assembler-not "\tvpxor" } } */
 /* { dg-final { scan-assembler-not "\tvpandn" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c
index 06db7521305..5b260818cb3 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c
@@ -3,7 +3,7 @@ 
 /* { dg-final { scan-assembler-not {not[bwlqd]\]} } } */
 /* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$5} 4} } */
 /* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$6} 4} } */
-/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$7} 4} } */
+/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$[37]} 4} } */
 /* { dg-final { scan-assembler-times {(?n)vcmpp[sd][ \t]*\$5} 2} } */
 /* { dg-final { scan-assembler-times {(?n)vcmpp[sd][ \t]*\$6} 2} } */
 /* { dg-final { scan-assembler-times {(?n)vcmpp[sd][ \t]*\$7} 2} } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-3.c b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-3.c
index fc66a9f5572..9ed4680346b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-3.c
@@ -952,4 +952,4 @@  V foo_254_3(V a, V b, V c) { return (c|b)|a; }
 
 V foo_255_1(V a, V b, V c) { return (V){~0,~0,~0,~0}; }
 
-/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 694 } } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 673 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-4.c b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-4.c
index 14296508cac..eb39ffc2564 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-4.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-4.c
@@ -952,4 +952,4 @@  V foo_254_3(V a, V b, V c) { return (c|b)|a; }
 
 V foo_255_1(V a, V b, V c) { return (V){~0,~0,~0,~0}; }
 
-/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 694 } } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 673 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-5.c b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-5.c
index 3dbd9545283..85de5b02ce6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-5.c
@@ -952,4 +952,4 @@  V foo_254_3(V a, V b, V c) { return (c|b)|a; }
 
 V foo_255_1(V a, V b, V c) { return (V){~0,~0,~0,~0}; }
 
-/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 679 } } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 673 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
index 767b0e4b3ac..2394cff39f2 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile { target int128 } } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-avx512f" } */
 typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
 typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
 typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));