diff mbox series

[4/8] i386: Support vectorized BF16 add/sub/mul/div with AVX10.2 instructions

Message ID 20240826064238.2268967-5-haochen.jiang@intel.com
State New
Headers show
Series i386: Opmitize code with AVX10.2 new instructions | expand

Commit Message

Haochen Jiang Aug. 26, 2024, 6:42 a.m. UTC
From: Levy Hsu <admin@levyhsu.com>

AVX10.2 introduces several non-exception instructions for BF16 vector.
Enable vectorized BF add/sub/mul/div operation by supporting standard
optab for them.

gcc/ChangeLog:

	* config/i386/sse.md (div<mode>3): New expander for BFmode div.
	(VF_BHSD): New mode iterator with vector BFmodes.
	(<insn><mode>3<mask_name><round_name>): Change mode to VF_BHSD.
	(mul<mode>3<mask_name><round_name>): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx10_2-512-bf-vector-operations-1.c: New test.
	* gcc.target/i386/avx10_2-bf-vector-operations-1.c: Ditto.
---
 gcc/config/i386/sse.md                        | 49 ++++++++++--
 .../i386/avx10_2-512-bf-vector-operations-1.c | 42 ++++++++++
 .../i386/avx10_2-bf-vector-operations-1.c     | 79 +++++++++++++++++++
 3 files changed, 162 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-operations-1.c
diff mbox series

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 442ac93afa2..ebca462bae8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -391,6 +391,19 @@ 
    (V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX")
    (V2DF "TARGET_SSE2")])
 
+(define_mode_iterator VF_BHSD
+  [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
+   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V16SF "TARGET_AVX512F && TARGET_EVEX512")
+   (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F && TARGET_EVEX512")
+   (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")
+   (V32BF "TARGET_AVX10_2_512")
+   (V16BF "TARGET_AVX10_2_256")
+   (V8BF "TARGET_AVX10_2_256")
+  ])
+
 ;; 128-, 256- and 512-bit float vector modes for bitwise operations
 (define_mode_iterator VFB
   [(V32BF "TARGET_AVX512F && TARGET_EVEX512")
@@ -2527,10 +2540,10 @@ 
 })
 
 (define_expand "<insn><mode>3<mask_name><round_name>"
-  [(set (match_operand:VFH 0 "register_operand")
-	(plusminus:VFH
-	  (match_operand:VFH 1 "<round_nimm_predicate>")
-	  (match_operand:VFH 2 "<round_nimm_predicate>")))]
+  [(set (match_operand:VF_BHSD 0 "register_operand")
+	(plusminus:VF_BHSD
+	  (match_operand:VF_BHSD 1 "<round_nimm_predicate>")
+	  (match_operand:VF_BHSD 2 "<round_nimm_predicate>")))]
   "TARGET_SSE && <mask_mode512bit_condition> && <round_mode_condition>"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
@@ -2616,10 +2629,10 @@ 
 })
 
 (define_expand "mul<mode>3<mask_name><round_name>"
-  [(set (match_operand:VFH 0 "register_operand")
-	(mult:VFH
-	  (match_operand:VFH 1 "<round_nimm_predicate>")
-	  (match_operand:VFH 2 "<round_nimm_predicate>")))]
+  [(set (match_operand:VF_BHSD 0 "register_operand")
+	(mult:VF_BHSD
+	  (match_operand:VF_BHSD 1 "<round_nimm_predicate>")
+	  (match_operand:VF_BHSD 2 "<round_nimm_predicate>")))]
   "TARGET_SSE && <mask_mode512bit_condition> && <round_mode_condition>"
   "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
 
@@ -2734,6 +2747,26 @@ 
     }
 })
 
+(define_expand "div<mode>3"
+  [(set (match_operand:VBF_AVX10_2 0 "register_operand")
+	(div:VBF_AVX10_2
+	  (match_operand:VBF_AVX10_2 1 "register_operand")
+	  (match_operand:VBF_AVX10_2 2 "vector_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  if (TARGET_RECIP_VEC_DIV
+      && optimize_insn_for_speed_p ()
+      && flag_finite_math_only
+      && flag_unsafe_math_optimizations)
+    {
+      rtx op = gen_reg_rtx (<MODE>mode);
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+      emit_insn (gen_avx10_2_rcppbf16_<mode> (op, operands[2]));
+      emit_insn (gen_avx10_2_mulnepbf16_<mode> (operands[0], operands[1], op));
+      DONE;
+    }
+})
+
 (define_expand "cond_div<mode>"
   [(set (match_operand:VFH 0 "register_operand")
 	(vec_merge:VFH
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c
new file mode 100644
index 00000000000..d6b0750c233
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf-vector-operations-1.c
@@ -0,0 +1,42 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx10.2-512 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+typedef __bf16 v32bf __attribute__ ((__vector_size__ (64)));
+
+v32bf
+foo_mul (v32bf a, v32bf b)
+{
+  return a * b;
+}
+
+v32bf
+foo_add (v32bf a, v32bf b)
+{
+  return a + b;
+}
+
+v32bf
+foo_div (v32bf a, v32bf b)
+{
+  return a / b;
+}
+
+v32bf
+foo_sub (v32bf a, v32bf b)
+{
+  return a - b;
+}
+
+__attribute__((optimize("fast-math")))
+v32bf
+foo_div_fast_math (v32bf a, v32bf b)
+{
+  return a / b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-operations-1.c
new file mode 100644
index 00000000000..77092b9fce1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-bf-vector-operations-1.c
@@ -0,0 +1,79 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+typedef __bf16 v16bf __attribute__ ((__vector_size__ (32)));
+typedef __bf16 v8bf __attribute__ ((__vector_size__ (16)));
+
+v16bf
+foo_mul_256 (v16bf a, v16bf b)
+{
+  return a * b;
+}
+
+v16bf
+foo_add_256 (v16bf a, v16bf b)
+{
+  return a + b;
+}
+
+v16bf
+foo_div_256 (v16bf a, v16bf b)
+{
+  return a / b;
+}
+
+v16bf
+foo_sub_256 (v16bf a, v16bf b)
+{
+  return a - b;
+}
+
+__attribute__((optimize("fast-math")))
+v16bf
+foo_div_fast_math_256 (v16bf a, v16bf b)
+{
+  return a / b;
+}
+
+v8bf
+foo_mul_128 (v8bf a, v8bf b)
+{
+  return a * b;
+}
+
+v8bf
+foo_add_128 (v8bf a, v8bf b)
+{
+  return a + b;
+}
+
+v8bf
+foo_div_128 (v8bf a, v8bf b)
+{
+  return a / b;
+}
+
+v8bf
+foo_sub_128 (v8bf a, v8bf b)
+{
+  return a - b;
+}
+
+__attribute__((optimize("fast-math")))
+v8bf
+foo_div_fast_math_128 (v8bf a, v8bf b)
+{
+  return a / b;
+}