diff mbox series

i386: Support partial vectorized V2BF/V4BF plus/minus/mult/div/sqrt

Message ID 20240902083300.1861771-1-admin@levyhsu.com
State New
Headers show
Series i386: Support partial vectorized V2BF/V4BF plus/minus/mult/div/sqrt | expand

Commit Message

Levy Hsu Sept. 2, 2024, 8:32 a.m. UTC
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

This patch introduces a new mode iterator (VBF_32_64) and new expanders in the i386 backend to support partial vectorization of bf16 operations using AVX10.2 instructions. The supported operations are addition, subtraction, multiplication, division, and square root for the V2BF and V4BF vector modes.

gcc/ChangeLog:

	* config/i386/mmx.md (VBF_32_64): New mode iterator for partial vectorized V2BF/V4BF.
	(<insn><mode>3): New define_expand for plusminusmultdiv.
	(sqrt<mode>2): New define_expand for sqrt.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c: New test.
	* gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c: New test.
---
 gcc/config/i386/mmx.md                        | 37 ++++++++++++
 .../avx10_2-partial-bf-vector-fast-math-1.c   | 22 +++++++
 .../avx10_2-partial-bf-vector-operations-1.c  | 57 +++++++++++++++++++
 3 files changed, 116 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c

Comments

Hongtao Liu Sept. 3, 2024, 2 a.m. UTC | #1
On Mon, Sep 2, 2024 at 4:33 PM Levy Hsu <admin@levyhsu.com> wrote:
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> This patch introduces new mode iterators and expands for the i386 architecture to support partial vectorization of bf16 operations using AVX10.2 instructions. These operations include addition, subtraction, multiplication, division, and square root calculations for V2BF and V4BF data types.
Ok.
>
> gcc/ChangeLog:
>
>         * config/i386/mmx.md (VBF_32_64): New mode iterator for partial vectorized V2BF/V4BF.
>         (<insn><mode>3): New define_expand for plusminusmultdiv.
>         (sqrt<mode>2): New define_expand for sqrt.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c: New test.
>         * gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c: New test.
> ---
>  gcc/config/i386/mmx.md                        | 37 ++++++++++++
>  .../avx10_2-partial-bf-vector-fast-math-1.c   | 22 +++++++
>  .../avx10_2-partial-bf-vector-operations-1.c  | 57 +++++++++++++++++++
>  3 files changed, 116 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index e0065ed4d48..9116ddb5321 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -94,6 +94,8 @@
>
>  (define_mode_iterator VHF_32_64 [V2HF (V4HF "TARGET_MMX_WITH_SSE")])
>
> +(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")])
> +
>  ;; Mapping from integer vector mode to mnemonic suffix
>  (define_mode_attr mmxvecsize
>    [(V8QI "b") (V4QI "b") (V2QI "b")
> @@ -2036,6 +2038,26 @@
>    DONE;
>  })
>
> +;; VDIVNEPBF16 does not generate floating point exceptions.
> +(define_expand "<insn><mode>3"
> +  [(set (match_operand:VBF_32_64 0 "register_operand")
> +    (plusminusmultdiv:VBF_32_64
> +      (match_operand:VBF_32_64 1 "nonimmediate_operand")
> +      (match_operand:VBF_32_64 2 "nonimmediate_operand")))]
> +  "TARGET_AVX10_2_256"
> +{
> +  rtx op0 = gen_reg_rtx (V8BFmode);
> +  rtx op1 = lowpart_subreg (V8BFmode,
> +                           force_reg (<MODE>mode, operands[1]), <MODE>mode);
> +  rtx op2 = lowpart_subreg (V8BFmode,
> +                           force_reg (<MODE>mode, operands[2]), <MODE>mode);
> +
> +  emit_insn (gen_<insn>v8bf3 (op0, op1, op2));
> +
> +  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
> +  DONE;
> +})
> +
>  (define_expand "divv2hf3"
>    [(set (match_operand:V2HF 0 "register_operand")
>         (div:V2HF
> @@ -2091,6 +2113,21 @@
>    DONE;
>  })
>
> +(define_expand "sqrt<mode>2"
> +  [(set (match_operand:VBF_32_64 0 "register_operand")
> +       (sqrt:VBF_32_64 (match_operand:VBF_32_64 1 "vector_operand")))]
> +  "TARGET_AVX10_2_256"
> +{
> +  rtx op0 = gen_reg_rtx (V8BFmode);
> +  rtx op1 = lowpart_subreg (V8BFmode,
> +                           force_reg (<MODE>mode, operands[1]), <MODE>mode);
> +
> +  emit_insn (gen_sqrtv8bf2 (op0, op1));
> +
> +  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
> +  DONE;
> +})
> +
>  (define_expand "<code><mode>2"
>    [(set (match_operand:VHF_32_64 0 "register_operand")
>         (absneg:VHF_32_64
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
> new file mode 100644
> index 00000000000..fd064f17445
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx10.2 -O2" } */
> +/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +
> +typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
> +typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
> +
> +
> +__attribute__((optimize("fast-math")))
> +v4bf
> +foo_div_fast_math_4 (v4bf a, v4bf b)
> +{
> +  return a / b;
> +}
> +
> +__attribute__((optimize("fast-math")))
> +v2bf
> +foo_div_fast_math_2 (v2bf a, v2bf b)
> +{
> +  return a / b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
> new file mode 100644
> index 00000000000..e7ee08a20a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx10.2 -O2" } */
> +/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +
> +typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
> +typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
> +
> +v4bf
> +foo_mul_4 (v4bf a, v4bf b)
> +{
> +  return a * b;
> +}
> +
> +v4bf
> +foo_add_4 (v4bf a, v4bf b)
> +{
> +  return a + b;
> +}
> +
> +v4bf
> +foo_div_4 (v4bf a, v4bf b)
> +{
> +  return a / b;
> +}
> +
> +v4bf
> +foo_sub_4 (v4bf a, v4bf b)
> +{
> +  return a - b;
> +}
> +
> +v2bf
> +foo_mul_2 (v2bf a, v2bf b)
> +{
> +  return a * b;
> +}
> +
> +v2bf
> +foo_add_2 (v2bf a, v2bf b)
> +{
> +  return a + b;
> +}
> +
> +v2bf
> +foo_div_2 (v2bf a, v2bf b)
> +{
> +  return a / b;
> +}
> +
> +v2bf
> +foo_sub_2 (v2bf a, v2bf b)
> +{
> +  return a - b;
> +}
> --
> 2.31.1
>
diff mbox series

Patch

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index e0065ed4d48..9116ddb5321 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -94,6 +94,8 @@ 
 
 (define_mode_iterator VHF_32_64 [V2HF (V4HF "TARGET_MMX_WITH_SSE")])
 
+(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")])
+
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
@@ -2036,6 +2038,26 @@ 
   DONE;
 })
 
+;; VDIVNEPBF16 does not generate floating point exceptions.
+(define_expand "<insn><mode>3"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+    (plusminusmultdiv:VBF_32_64
+      (match_operand:VBF_32_64 1 "nonimmediate_operand")
+      (match_operand:VBF_32_64 2 "nonimmediate_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode,
+			    force_reg (<MODE>mode, operands[1]), <MODE>mode);
+  rtx op2 = lowpart_subreg (V8BFmode,
+			    force_reg (<MODE>mode, operands[2]), <MODE>mode);
+
+  emit_insn (gen_<insn>v8bf3 (op0, op1, op2));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 (define_expand "divv2hf3"
   [(set (match_operand:V2HF 0 "register_operand")
 	(div:V2HF
@@ -2091,6 +2113,21 @@ 
   DONE;
 })
 
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+	(sqrt:VBF_32_64 (match_operand:VBF_32_64 1 "vector_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode,
+			    force_reg (<MODE>mode, operands[1]), <MODE>mode);
+
+  emit_insn (gen_sqrtv8bf2 (op0, op1));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 (define_expand "<code><mode>2"
   [(set (match_operand:VHF_32_64 0 "register_operand")
 	(absneg:VHF_32_64
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
new file mode 100644
index 00000000000..fd064f17445
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+
+typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
+typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
+
+
+__attribute__((optimize("fast-math")))
+v4bf
+foo_div_fast_math_4 (v4bf a, v4bf b)
+{
+  return a / b;
+}
+
+__attribute__((optimize("fast-math")))
+v2bf
+foo_div_fast_math_2 (v2bf a, v2bf b)
+{
+  return a / b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
new file mode 100644
index 00000000000..e7ee08a20a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
@@ -0,0 +1,57 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+
+typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
+typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
+
+v4bf
+foo_mul_4 (v4bf a, v4bf b)
+{
+  return a * b;
+}
+
+v4bf
+foo_add_4 (v4bf a, v4bf b)
+{
+  return a + b;
+}
+
+v4bf
+foo_div_4 (v4bf a, v4bf b)
+{
+  return a / b;
+}
+
+v4bf
+foo_sub_4 (v4bf a, v4bf b)
+{
+  return a - b;
+}
+
+v2bf
+foo_mul_2 (v2bf a, v2bf b)
+{
+  return a * b;
+}
+
+v2bf
+foo_add_2 (v2bf a, v2bf b)
+{
+  return a + b;
+}
+
+v2bf
+foo_div_2 (v2bf a, v2bf b)
+{
+  return a / b;
+}
+
+v2bf
+foo_sub_2 (v2bf a, v2bf b)
+{
+  return a - b;
+}