diff mbox series

[1/8] i386: Auto vectorize sdot_prod, usdot_prod, udot_prod with AVX10.2 instructions

Message ID 20240826064238.2268967-2-haochen.jiang@intel.com
State New
Headers show
Series i386: Optimize code with AVX10.2 new instructions | expand

Commit Message

Haochen Jiang Aug. 26, 2024, 6:42 a.m. UTC
gcc/ChangeLog:

	* config/i386/sse.md (VI1_AVX512VNNIBW): New.
	(VI2_AVX10_2): Ditto.
	(sdot_prod<mode>): Add AVX10.2 support for auto-vectorization
	and merge in the 512-bit variant.
	(udot_prod<mode>): Ditto.
	(sdot_prodv64qi): Remove.
	(udot_prodv64qi): Ditto.
	(usdot_prod<mode>): Add AVX10.2 support for auto-vectorization.
	(udot_prod<mode>): Ditto.

gcc/testsuite/ChangeLog:

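	* gcc.target/i386/vnniint16-auto-vectorize-2.c: Only define
	AVXVNNIINT16 when AVX10_2 is not defined.  Include CHECK and
	vnniint16-auto-vectorize-1.c before the default TEST definition.
	Only define N when not already defined.
	* gcc.target/i386/vnniint8-auto-vectorize-2.c: Likewise for
	AVXVNNIINT8 and vnniint8-auto-vectorize-1.c.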
	* gcc.target/i386/vnniint16-auto-vectorize-3.c: New test.
	* gcc.target/i386/vnniint16-auto-vectorize-4.c: Ditto.
	* gcc.target/i386/vnniint8-auto-vectorize-3.c: Ditto.
	* gcc.target/i386/vnniint8-auto-vectorize-4.c: Ditto.
---
 gcc/config/i386/sse.md                        | 93 +++++--------------
 .../i386/vnniint16-auto-vectorize-2.c         | 11 ++-
 .../i386/vnniint16-auto-vectorize-3.c         |  6 ++
 .../i386/vnniint16-auto-vectorize-4.c         | 15 +++
 .../i386/vnniint8-auto-vectorize-2.c          | 12 ++-
 .../i386/vnniint8-auto-vectorize-3.c          |  6 ++
 .../i386/vnniint8-auto-vectorize-4.c          | 15 +++
 7 files changed, 80 insertions(+), 78 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-4.c
diff mbox series

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index da91d39cf8e..442ac93afa2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -610,6 +610,10 @@ 
 (define_mode_iterator VI1_AVX512VNNI
   [(V64QI "TARGET_AVX512VNNI && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI])
 
+(define_mode_iterator VI1_AVX512VNNIBW
+  [(V64QI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512")
+   (V32QI "TARGET_AVX2") V16QI])
+
 (define_mode_iterator VI12_256_512_AVX512VL
   [(V64QI "TARGET_EVEX512") (V32QI "TARGET_AVX512VL")
    (V32HI "TARGET_EVEX512") (V16HI "TARGET_AVX512VL")])
@@ -627,6 +631,9 @@ 
   [(V32HI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512")
    (V16HI "TARGET_AVX2") V8HI])
 
+(define_mode_iterator VI2_AVX10_2
+  [(V32HI "TARGET_AVX10_2_512") V16HI V8HI])
+
 (define_mode_iterator VI4_AVX
   [(V8SI "TARGET_AVX") V4SI])
 
@@ -31232,12 +31239,13 @@ 
 
 (define_expand "sdot_prod<mode>"
   [(match_operand:<ssedvecmode> 0 "register_operand")
-   (match_operand:VI1_AVX2 1 "register_operand")
-   (match_operand:VI1_AVX2 2 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 1 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 2 "register_operand")
    (match_operand:<ssedvecmode> 3 "register_operand")]
   "TARGET_SSE2"
 {
-  if (TARGET_AVXVNNIINT8)
+  if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512)
+      || (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256)))
     {
       operands[1] = lowpart_subreg (<ssedvecmode>mode,
 				    force_reg (<MODE>mode, operands[1]),
@@ -31276,44 +31284,15 @@ 
   DONE;
 })
 
-(define_expand "sdot_prodv64qi"
-  [(match_operand:V16SI 0 "register_operand")
-   (match_operand:V64QI 1 "register_operand")
-   (match_operand:V64QI 2 "register_operand")
-   (match_operand:V16SI 3 "register_operand")]
-  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
-{
-  /* Emulate with vpdpwssd.  */
-  rtx op1_lo = gen_reg_rtx (V32HImode);
-  rtx op1_hi = gen_reg_rtx (V32HImode);
-  rtx op2_lo = gen_reg_rtx (V32HImode);
-  rtx op2_hi = gen_reg_rtx (V32HImode);
-
-  emit_insn (gen_vec_unpacks_lo_v64qi (op1_lo, operands[1]));
-  emit_insn (gen_vec_unpacks_lo_v64qi (op2_lo, operands[2]));
-  emit_insn (gen_vec_unpacks_hi_v64qi (op1_hi, operands[1]));
-  emit_insn (gen_vec_unpacks_hi_v64qi (op2_hi, operands[2]));
-
-  rtx res1 = gen_reg_rtx (V16SImode);
-  rtx res2 = gen_reg_rtx (V16SImode);
-  rtx sum = gen_reg_rtx (V16SImode);
-
-  emit_move_insn (sum, CONST0_RTX (V16SImode));
-  emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum));
-  emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3]));
-
-  emit_insn (gen_addv16si3 (operands[0], res1, res2));
-  DONE;
-})
-
 (define_expand "udot_prod<mode>"
   [(match_operand:<ssedvecmode> 0 "register_operand")
-   (match_operand:VI1_AVX2 1 "register_operand")
-   (match_operand:VI1_AVX2 2 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 1 "register_operand")
+   (match_operand:VI1_AVX512VNNIBW 2 "register_operand")
    (match_operand:<ssedvecmode> 3 "register_operand")]
   "TARGET_SSE2"
 {
-  if (TARGET_AVXVNNIINT8)
+  if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512)
+      || (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256)))
     {
       operands[1] = lowpart_subreg (<ssedvecmode>mode,
 				    force_reg (<MODE>mode, operands[1]),
@@ -31352,36 +31331,6 @@ 
   DONE;
 })
 
-(define_expand "udot_prodv64qi"
-  [(match_operand:V16SI 0 "register_operand")
-   (match_operand:V64QI 1 "register_operand")
-   (match_operand:V64QI 2 "register_operand")
-   (match_operand:V16SI 3 "register_operand")]
-  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
-{
-  /* Emulate with vpdpwssd.  */
-  rtx op1_lo = gen_reg_rtx (V32HImode);
-  rtx op1_hi = gen_reg_rtx (V32HImode);
-  rtx op2_lo = gen_reg_rtx (V32HImode);
-  rtx op2_hi = gen_reg_rtx (V32HImode);
-
-  emit_insn (gen_vec_unpacku_lo_v64qi (op1_lo, operands[1]));
-  emit_insn (gen_vec_unpacku_lo_v64qi (op2_lo, operands[2]));
-  emit_insn (gen_vec_unpacku_hi_v64qi (op1_hi, operands[1]));
-  emit_insn (gen_vec_unpacku_hi_v64qi (op2_hi, operands[2]));
-
-  rtx res1 = gen_reg_rtx (V16SImode);
-  rtx res2 = gen_reg_rtx (V16SImode);
-  rtx sum = gen_reg_rtx (V16SImode);
-
-  emit_move_insn (sum, CONST0_RTX (V16SImode));
-  emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum));
-  emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3]));
-
-  emit_insn (gen_addv16si3 (operands[0], res1, res2));
-  DONE;
-})
-
 (define_insn "vpdp<vpdotprodtype>_<mode>"
   [(set (match_operand:VI4_AVX 0 "register_operand" "=v")
 	(unspec:VI4_AVX
@@ -31757,10 +31706,10 @@ 
 
 (define_expand "usdot_prod<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
-   (match_operand:VI2_AVX2 1 "register_operand")
-   (match_operand:VI2_AVX2 2 "register_operand")
+   (match_operand:VI2_AVX10_2 1 "register_operand")
+   (match_operand:VI2_AVX10_2 2 "register_operand")
    (match_operand:<sseunpackmode> 3 "register_operand")]
-  "TARGET_AVXVNNIINT16"
+  "TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256"
 {
   operands[1] = lowpart_subreg (<sseunpackmode>mode,
                                 force_reg (<MODE>mode, operands[1]),
@@ -31775,10 +31724,10 @@ 
 
 (define_expand "udot_prod<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
-   (match_operand:VI2_AVX2 1 "register_operand")
-   (match_operand:VI2_AVX2 2 "register_operand")
+   (match_operand:VI2_AVX10_2 1 "register_operand")
+   (match_operand:VI2_AVX10_2 2 "register_operand")
    (match_operand:<sseunpackmode> 3 "register_operand")]
-  "TARGET_AVXVNNIINT16"
+  "TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256"
 {
   operands[1] = lowpart_subreg (<sseunpackmode>mode,
                                 force_reg (<MODE>mode, operands[1]),
diff --git a/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-2.c b/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-2.c
index 90dc0eade7e..1bd1dfbd3a3 100644
--- a/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-2.c
+++ b/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-2.c
@@ -2,19 +2,24 @@ 
 /* { dg-options "-O2 -mavxvnniint16" } */
 /* { dg-require-effective-target avxvnniint16 } */
 
+#ifndef AVX10_2
 #define AVXVNNIINT16
+#endif
+
 #ifndef CHECK
 #define CHECK "avx-check.h"
 #endif
 
+#include CHECK
+#include "vnniint16-auto-vectorize-1.c"
+
 #ifndef TEST
 #define TEST avx_test
 #endif
 
-#include CHECK
-#include "vnniint16-auto-vectorize-1.c"
-
+#ifndef N
 #define N 256
+#endif
 
 short a_i16[N];
 unsigned short b_u16[N], c_u16[N], d_u16[N];
diff --git a/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-3.c b/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-3.c
new file mode 100644
index 00000000000..85dd80e6d1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-3.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile } */                                     
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler "vpdpwusd\t" } } */
+/* { dg-final { scan-assembler "vpdpwuud\t" } } */
+
+#include "vnniint16-auto-vectorize-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-4.c b/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-4.c
new file mode 100644
index 00000000000..36b76987b50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vnniint16-auto-vectorize-4.c
@@ -0,0 +1,15 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx10.2-512" } */
+/* { dg-require-effective-target avx10_2_512 } */
+
+#ifndef CHECK
+#define CHECK "avx512f-check.h"
+#endif
+
+#define N 512
+
+#define AVX10_2
+#define AVX10_2_512
+#define AVX10_512BIT
+#define AVX512F_LEN 512
+#include "vnniint16-auto-vectorize-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-2.c b/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-2.c
index 99853e6c3b7..5a791f0f59e 100644
--- a/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-2.c
+++ b/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-2.c
@@ -2,19 +2,25 @@ 
 /* { dg-options "-O2 -mavxvnniint8" } */
 /* { dg-require-effective-target avxvnniint8 } */
 
+#ifndef AVX10_2
 #define AVXVNNIINT8
+#endif
+
 #ifndef CHECK
 #define CHECK "avx-check.h"
 #endif
 
+#include CHECK
+#include "vnniint8-auto-vectorize-1.c"
+
 #ifndef TEST
 #define TEST avx_test
 #endif
 
-#include CHECK
-#include "vnniint8-auto-vectorize-1.c"
-
+#ifndef N
 #define N 256
+#endif
+
 char a_i8[N], b_i8[N];
 unsigned char c_u8[N], d_u8[N];
 int i8_exp, i8_ref;
diff --git a/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-3.c b/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-3.c
new file mode 100644
index 00000000000..bbb49e81b69
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-3.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile } */                                     
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler "vpdpbssd\t" } } */
+/* { dg-final { scan-assembler "vpdpbuud\t" } } */
+
+#include "vnniint8-auto-vectorize-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-4.c b/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-4.c
new file mode 100644
index 00000000000..41098b1abcd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vnniint8-auto-vectorize-4.c
@@ -0,0 +1,15 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx10.2-512" } */
+/* { dg-require-effective-target avx10_2_512 } */
+
+#ifndef CHECK
+#define CHECK "avx512f-check.h"
+#endif
+
+#define N 512
+
+#define AVX10_2
+#define AVX10_2_512
+#define AVX10_512BIT
+#define AVX512F_LEN 512
+#include "vnniint8-auto-vectorize-2.c"