Message ID | 20241107055820.684921-1-hongyu.wang@intel.com |
---|---|
State | New |
Headers | show |
Series | i386: Support cstorebf4 with native bf16 comi | expand |
On Thu, Nov 7, 2024 at 6:58 AM Hongyu Wang <hongyu.wang@intel.com> wrote: > > Hi, > > We recently supports cbranchbf4 with AVX10_2 native bf16 comi > instructions, so do similar to cstorebf4. > > Bootstrapped & regtested on x86_64-pc-linux-gnu. > Ok for trunk? > > gcc/ChangeLog: > > * config/i386/i386.md (cstorebf4): Use vcomsbf16 under > TARGET_AVX10_2_256 and -fno-trapping-math. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx10_2-comibf-3.c: New test. > * gcc.target/i386/avx10_2-comibf-4.c: Likewise. OK. While there, can you please also fix formatting of new code in cbranchbf4? There is no need for curly braces and alignment is wrong. Thanks, Uros. > --- > gcc/config/i386/i386.md | 18 +++++--- > .../gcc.target/i386/avx10_2-comibf-3.c | 27 ++++++++++++ > .../gcc.target/i386/avx10_2-comibf-4.c | 41 +++++++++++++++++++ > 3 files changed, 80 insertions(+), 6 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index c492fe55881..b5ba75ef8e7 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -1860,12 +1860,18 @@ (define_expand "cstorebf4" > (const_int 0)]))] > "TARGET_80387 || (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH)" > { > - rtx op1 = ix86_expand_fast_convert_bf_to_sf (operands[2]); > - rtx op2 = ix86_expand_fast_convert_bf_to_sf (operands[3]); > - rtx res = emit_store_flag_force (operands[0], GET_CODE (operands[1]), > - op1, op2, SFmode, 0, 1); > - if (!rtx_equal_p (res, operands[0])) > - emit_move_insn (operands[0], res); > + if (TARGET_AVX10_2_256 && !flag_trapping_math) > + ix86_expand_setcc (operands[0], GET_CODE (operands[1]), > + operands[2], operands[3]); > + else > + { > + rtx op1 = ix86_expand_fast_convert_bf_to_sf (operands[2]); > + rtx op2 = ix86_expand_fast_convert_bf_to_sf (operands[3]); > + rtx res = emit_store_flag_force (operands[0], GET_CODE 
(operands[1]), > + op1, op2, SFmode, 0, 1); > + if (!rtx_equal_p (res, operands[0])) > + emit_move_insn (operands[0], res); > + } > DONE; > }) > > diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c > new file mode 100644 > index 00000000000..afa41a3f071 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64-v3 -O2" } */ > + > +/* { dg-final { scan-assembler-times "vcomsbf16\[ \\t\]+\[^{}\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 6 } } */ > +/* { dg-final { scan-assembler-times "set\[aeglnb\]+" 6 } } */ > + > +#define AVX10_ATTR \ > +__attribute__((noinline, __target__("avx10.2"), optimize("no-trapping-math"))) > + > +AVX10_ATTR > +int foo1_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) > +{ > + return a == b && c < d; > +} > + > +AVX10_ATTR > +int foo2_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) > +{ > + return a > b || c != d; > +} > + > +AVX10_ATTR > +int foo3_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) > +{ > + return (a >= b) * (c <= d); > +} > + > diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c > new file mode 100644 > index 00000000000..18848ddb5e9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c > @@ -0,0 +1,41 @@ > +/* { dg-do run { target { avx10_2 } } } */ > +/* { dg-options "-march=x86-64-v3 -O2" } */ > + > +#include "avx10_2-comibf-3.c" > + > +__attribute__((noinline)) > +int foo1 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) > +{ > + return a == b && c < d; > +} > + > +__attribute__((noinline)) > +int foo2 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) > +{ > + return a > b || c != d; > +} > + > +__attribute__((noinline)) > +int foo3 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) > +{ > + return (a >= b) * (c <= d); > +} > + > + > +int main (void) > +{ > + if (!__builtin_cpu_supports ("avx10.2")) > + return 0; > + > + __bf16 
a = 0.5bf16, b = -0.25bf16, c = 1.75bf16, d = -0.125bf16; > + > + if (foo1_avx10 (a, b, c, d) != foo1 (a, b, c, d)) > + __builtin_abort (); > + > + if (foo2_avx10 (b, c, d, a) != foo2 (b, c, d, a)) > + __builtin_abort (); > + > + if (foo3_avx10 (c, d, a, b) != foo3 (c, d, a, b)) > + __builtin_abort (); > +} > + > -- > 2.31.1 >
Uros Bizjak <ubizjak@gmail.com> 于2024年11月7日周四 15:22写道: > > On Thu, Nov 7, 2024 at 6:58 AM Hongyu Wang <hongyu.wang@intel.com> wrote: > > > > Hi, > > > > We recently supports cbranchbf4 with AVX10_2 native bf16 comi > > instructions, so do similar to cstorebf4. > > > > Bootstrapped & regtested on x86_64-pc-linux-gnu. > > Ok for trunk? > > > > gcc/ChangeLog: > > > > * config/i386/i386.md (cstorebf4): Use vcomsbf16 under > > TARGET_AVX10_2_256 and -fno-trapping-math. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/avx10_2-comibf-3.c: New test. > > * gcc.target/i386/avx10_2-comibf-4.c: Likewise. > > OK. > > While there, can you please also fix formatting of new code in > cbranchbf4? There is no need for curly braces and alignment is wrong. > Yes, the attached patch is what I'm pushing. Thanks.
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index c492fe55881..b5ba75ef8e7 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1860,12 +1860,18 @@ (define_expand "cstorebf4" (const_int 0)]))] "TARGET_80387 || (SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH)" { - rtx op1 = ix86_expand_fast_convert_bf_to_sf (operands[2]); - rtx op2 = ix86_expand_fast_convert_bf_to_sf (operands[3]); - rtx res = emit_store_flag_force (operands[0], GET_CODE (operands[1]), - op1, op2, SFmode, 0, 1); - if (!rtx_equal_p (res, operands[0])) - emit_move_insn (operands[0], res); + if (TARGET_AVX10_2_256 && !flag_trapping_math) + ix86_expand_setcc (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); + else + { + rtx op1 = ix86_expand_fast_convert_bf_to_sf (operands[2]); + rtx op2 = ix86_expand_fast_convert_bf_to_sf (operands[3]); + rtx res = emit_store_flag_force (operands[0], GET_CODE (operands[1]), + op1, op2, SFmode, 0, 1); + if (!rtx_equal_p (res, operands[0])) + emit_move_insn (operands[0], res); + } DONE; }) diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c new file mode 100644 index 00000000000..afa41a3f071 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-3.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -O2" } */ + +/* { dg-final { scan-assembler-times "vcomsbf16\[ \\t\]+\[^{}\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 6 } } */ +/* { dg-final { scan-assembler-times "set\[aeglnb\]+" 6 } } */ + +#define AVX10_ATTR \ +__attribute__((noinline, __target__("avx10.2"), optimize("no-trapping-math"))) + +AVX10_ATTR +int foo1_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) +{ + return a == b && c < d; +} + +AVX10_ATTR +int foo2_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) +{ + return a > b || c != d; +} + +AVX10_ATTR +int foo3_avx10 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) +{ + return (a >= b) * (c <= d); +} + diff --git 
a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c new file mode 100644 index 00000000000..18848ddb5e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-4.c @@ -0,0 +1,41 @@ +/* { dg-do run { target { avx10_2 } } } */ +/* { dg-options "-march=x86-64-v3 -O2" } */ + +#include "avx10_2-comibf-3.c" + +__attribute__((noinline)) +int foo1 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) +{ + return a == b && c < d; +} + +__attribute__((noinline)) +int foo2 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) +{ + return a > b || c != d; +} + +__attribute__((noinline)) +int foo3 (__bf16 a, __bf16 b, __bf16 c, __bf16 d) +{ + return (a >= b) * (c <= d); +} + + +int main (void) +{ + if (!__builtin_cpu_supports ("avx10.2")) + return 0; + + __bf16 a = 0.5bf16, b = -0.25bf16, c = 1.75bf16, d = -0.125bf16; + + if (foo1_avx10 (a, b, c, d) != foo1 (a, b, c, d)) + __builtin_abort (); + + if (foo2_avx10 (b, c, d, a) != foo2 (b, c, d, a)) + __builtin_abort (); + + if (foo3_avx10 (c, d, a, b) != foo3 (c, d, a, b)) + __builtin_abort (); +} +