diff mbox series

i386: Support cstorebf4 with native bf16 comi

Message ID 20241107055820.684921-1-hongyu.wang@intel.com
State New
Headers show
Series i386: Support cstorebf4 with native bf16 comi | expand

Commit Message

Hongyu Wang Nov. 7, 2024, 5:58 a.m. UTC
Hi,

We recently added support for cbranchbf4 using the AVX10_2 native bf16
comi instructions, so do the same for cstorebf4.

Bootstrapped & regtested on x86_64-pc-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

	* config/i386/i386.md (cstorebf4): Use vcomsbf16 under
	TARGET_AVX10_2_256 and -fno-trapping-math.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx10_2-comibf-3.c: New test.
	* gcc.target/i386/avx10_2-comibf-4.c: Likewise.
---
 gcc/config/i386/i386.md                       | 18 +++++---
 .../gcc.target/i386/avx10_2-comibf-3.c        | 27 ++++++++++++
 .../gcc.target/i386/avx10_2-comibf-4.c        | 41 +++++++++++++++++++
 3 files changed, 80 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c

Comments

Uros Bizjak Nov. 7, 2024, 7:22 a.m. UTC | #1
On Thu, Nov 7, 2024 at 6:58 AM Hongyu Wang <hongyu.wang@intel.com> wrote:
>
> Hi,
>
> We recently supports cbranchbf4 with AVX10_2 native bf16 comi
> instructions, so do similar to cstorebf4.
>
> Bootstrapped & regtested on x86_64-pc-linux-gnu.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         * config/i386/i386.md (cstorebf4): Use vcomsbf16 under
>         TARGET_AVX10_2_256 and -fno-trapping-math.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx10_2-comibf-3.c: New test.
>         * gcc.target/i386/avx10_2-comibf-4.c: Likewise.

OK.

While there, can you please also fix formatting of new code in
cbranchbf4? There is no need for curly braces and alignment is wrong.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.md                       | 18 +++++---
>  .../gcc.target/i386/avx10_2-comibf-3.c        | 27 ++++++++++++
>  .../gcc.target/i386/avx10_2-comibf-4.c        | 41 +++++++++++++++++++
>  3 files changed, 80 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index c492fe55881..b5ba75ef8e7 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -1860,12 +1860,18 @@ (define_expand "cstorebf4"
>            (const_int 0)]))]
>    "TARGET_80387 || (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH)"
>  {
> -  rtx op1 = ix86_expand_fast_convert_bf_to_sf (operands[2]);
> -  rtx op2 = ix86_expand_fast_convert_bf_to_sf (operands[3]);
> -  rtx res = emit_store_flag_force (operands[0], GET_CODE (operands[1]),
> -                                  op1, op2, SFmode, 0, 1);
> -  if (!rtx_equal_p (res, operands[0]))
> -    emit_move_insn (operands[0], res);
> +  if (TARGET_AVX10_2_256 && !flag_trapping_math)
> +    ix86_expand_setcc (operands[0], GET_CODE (operands[1]),
> +                      operands[2], operands[3]);
> +  else
> +    {
> +      rtx op1 = ix86_expand_fast_convert_bf_to_sf (operands[2]);
> +      rtx op2 = ix86_expand_fast_convert_bf_to_sf (operands[3]);
> +      rtx res = emit_store_flag_force (operands[0], GET_CODE (operands[1]),
> +                                      op1, op2, SFmode, 0, 1);
> +      if (!rtx_equal_p (res, operands[0]))
> +      emit_move_insn (operands[0], res);
> +    }
>    DONE;
>  })
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c
> new file mode 100644
> index 00000000000..afa41a3f071
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -O2" } */
> +
> +/* { dg-final { scan-assembler-times "vcomsbf16\[ \\t\]+\[^{}\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 6 } } */
> +/* { dg-final { scan-assembler-times "set\[aeglnb\]+" 6 } } */
> +
> +#define AVX10_ATTR \
> +__attribute__((noinline, __target__("avx10.2"), optimize("no-trapping-math")))
> +
> +AVX10_ATTR
> +int foo1_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
> +{
> +  return a == b && c < d;
> +}
> +
> +AVX10_ATTR
> +int foo2_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
> +{
> +  return a > b || c != d;
> +}
> +
> +AVX10_ATTR
> +int foo3_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
> +{
> +  return (a >= b) * (c <= d);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c
> new file mode 100644
> index 00000000000..18848ddb5e9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c
> @@ -0,0 +1,41 @@
> +/* { dg-do run { target { avx10_2 } } } */
> +/* { dg-options "-march=x86-64-v3 -O2" } */
> +
> +#include "avx10_2-comibf-3.c"
> +
> +__attribute__((noinline))
> +int foo1 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
> +{
> +  return a == b && c < d;
> +}
> +
> +__attribute__((noinline))
> +int foo2 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
> +{
> +  return a > b || c != d;
> +}
> +
> +__attribute__((noinline))
> +int foo3 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
> +{
> +  return (a >= b) * (c <= d);
> +}
> +
> +
> +int main (void)
> +{
> +  if (!__builtin_cpu_supports ("avx10.2"))
> +    return 0;
> +
> +  __bf16 a = 0.5bf16, b = -0.25bf16, c = 1.75bf16, d = -0.125bf16;
> +
> +  if (foo1_avx10 (a, b, c, d) != foo1 (a, b, c, d))
> +    __builtin_abort ();
> +
> +  if (foo2_avx10 (b, c, d, a) != foo2 (b, c, d, a))
> +    __builtin_abort ();
> +
> +  if (foo3_avx10 (c, d, a, b) != foo3 (c, d, a, b))
> +    __builtin_abort ();
> +}
> +
> --
> 2.31.1
>
Hongyu Wang Nov. 7, 2024, 8:15 a.m. UTC | #2
Uros Bizjak <ubizjak@gmail.com> 于2024年11月7日周四 15:22写道:
>
> On Thu, Nov 7, 2024 at 6:58 AM Hongyu Wang <hongyu.wang@intel.com> wrote:
> >
> > Hi,
> >
> > We recently supports cbranchbf4 with AVX10_2 native bf16 comi
> > instructions, so do similar to cstorebf4.
> >
> > Bootstrapped & regtested on x86_64-pc-linux-gnu.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         * config/i386/i386.md (cstorebf4): Use vcomsbf16 under
> >         TARGET_AVX10_2_256 and -fno-trapping-math.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/avx10_2-comibf-3.c: New test.
> >         * gcc.target/i386/avx10_2-comibf-4.c: Likewise.
>
> OK.
>
> While there, can you please also fix formatting of new code in
> cbranchbf4? There is no need for curly braces and alignment is wrong.
>
Yes, the attached patch is what I'm pushing. Thanks.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c492fe55881..b5ba75ef8e7 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1860,12 +1860,18 @@  (define_expand "cstorebf4"
 	   (const_int 0)]))]
   "TARGET_80387 || (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH)"
 {
-  rtx op1 = ix86_expand_fast_convert_bf_to_sf (operands[2]);
-  rtx op2 = ix86_expand_fast_convert_bf_to_sf (operands[3]);
-  rtx res = emit_store_flag_force (operands[0], GET_CODE (operands[1]),
-				   op1, op2, SFmode, 0, 1);
-  if (!rtx_equal_p (res, operands[0]))
-    emit_move_insn (operands[0], res);
+  if (TARGET_AVX10_2_256 && !flag_trapping_math)
+    ix86_expand_setcc (operands[0], GET_CODE (operands[1]),
+		       operands[2], operands[3]);
+  else
+    {
+      rtx op1 = ix86_expand_fast_convert_bf_to_sf (operands[2]);
+      rtx op2 = ix86_expand_fast_convert_bf_to_sf (operands[3]);
+      rtx res = emit_store_flag_force (operands[0], GET_CODE (operands[1]),
+				       op1, op2, SFmode, 0, 1);
+      if (!rtx_equal_p (res, operands[0]))
+      emit_move_insn (operands[0], res);
+    }
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c
new file mode 100644
index 00000000000..afa41a3f071
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+
+/* { dg-final { scan-assembler-times "vcomsbf16\[ \\t\]+\[^{}\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 6 } } */
+/* { dg-final { scan-assembler-times "set\[aeglnb\]+" 6 } } */
+
+#define AVX10_ATTR \
+__attribute__((noinline, __target__("avx10.2"), optimize("no-trapping-math")))
+
+AVX10_ATTR
+int foo1_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
+{
+  return a == b && c < d;
+}
+
+AVX10_ATTR
+int foo2_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
+{
+  return a > b || c != d;
+}
+
+AVX10_ATTR
+int foo3_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
+{
+  return (a >= b) * (c <= d);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c
new file mode 100644
index 00000000000..18848ddb5e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c
@@ -0,0 +1,41 @@ 
+/* { dg-do run { target { avx10_2 } } } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+
+#include "avx10_2-comibf-3.c"
+
+__attribute__((noinline))
+int foo1 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
+{
+  return a == b && c < d;
+}
+
+__attribute__((noinline))
+int foo2 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
+{
+  return a > b || c != d;
+}
+
+__attribute__((noinline))
+int foo3 (__bf16 a, __bf16 b, __bf16 c, __bf16 d)
+{
+  return (a >= b) * (c <= d);
+}
+
+
+int main (void)
+{
+  if (!__builtin_cpu_supports ("avx10.2"))
+    return 0;
+
+  __bf16 a = 0.5bf16, b = -0.25bf16, c = 1.75bf16, d = -0.125bf16;
+
+  if (foo1_avx10 (a, b, c, d) != foo1 (a, b, c, d))
+    __builtin_abort ();
+
+  if (foo2_avx10 (b, c, d, a) != foo2 (b, c, d, a))
+    __builtin_abort ();
+  
+  if (foo3_avx10 (c, d, a, b) != foo3 (c, d, a, b))
+    __builtin_abort ();
+}
+