Message ID | 20240904025252.1894695-1-admin@levyhsu.com |
---|---|
State | New |
Headers | show |
Series | i386: Support partial signbit/xorsign/copysign/abs/neg/and/xor/ior/andn for V2BF/V4BF |
On Wed, Sep 4, 2024 at 10:53 AM Levy Hsu <admin@levyhsu.com> wrote: > > Hi > > This patch adds support for bf16 operations in V2BF and V4BF modes on i386, > handling signbit, xorsign, copysign, abs, neg, and various logical operations. > > Bootstrapped and tested on x86-64-pc-linux-gnu. > Ok for trunk? Ok. > > gcc/ChangeLog: > > * config/i386/i386.cc (ix86_build_const_vector): Add V2BF/V4BF. > (ix86_build_signbit_mask): Add V2BF/V4BF. > * config/i386/mmx.md: Modified supported logic op to use VHBF_32_64. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/part-vect-absnegbf.c: New test. > --- > gcc/config/i386/i386.cc | 4 + > gcc/config/i386/mmx.md | 74 +++++++++-------- > .../gcc.target/i386/part-vect-absnegbf.c | 81 +++++++++++++++++++ > 3 files changed, 124 insertions(+), 35 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 78bf890f14b..2bbfb1bf5fc 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -16176,6 +16176,8 @@ ix86_build_const_vector (machine_mode mode, bool vect, rtx value) > case E_V32BFmode: > case E_V16BFmode: > case E_V8BFmode: > + case E_V4BFmode: > + case E_V2BFmode: > n_elt = GET_MODE_NUNITS (mode); > v = rtvec_alloc (n_elt); > scalar_mode = GET_MODE_INNER (mode); > @@ -16215,6 +16217,8 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) > case E_V32BFmode: > case E_V16BFmode: > case E_V8BFmode: > + case E_V4BFmode: > + case E_V2BFmode: > vec_mode = mode; > imode = HImode; > break; > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index cb2697537a8..44adcd8d8e0 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -121,7 +121,7 @@ > ;; Mapping of vector float modes to an integer mode of the same size > (define_mode_attr mmxintvecmode > [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI") > - (V4HF "V4HI") (V2HF "V2HI")]) > + (V4HF "V4HI") (V2HF "V2HI") 
(V4BF "V4HI") (V2BF "V2HI")]) > > (define_mode_attr mmxintvecmodelower > [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi") > @@ -2091,18 +2091,22 @@ > DONE; > }) > > +(define_mode_iterator VHBF_32_64 > + [V2BF (V4BF "TARGET_MMX_WITH_SSE") > + V2HF (V4HF "TARGET_MMX_WITH_SSE")]) > + > (define_expand "<code><mode>2" > - [(set (match_operand:VHF_32_64 0 "register_operand") > - (absneg:VHF_32_64 > - (match_operand:VHF_32_64 1 "register_operand")))] > + [(set (match_operand:VHBF_32_64 0 "register_operand") > + (absneg:VHBF_32_64 > + (match_operand:VHBF_32_64 1 "register_operand")))] > "TARGET_SSE" > "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;") > > (define_insn_and_split "*mmx_<code><mode>" > - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x") > - (absneg:VHF_32_64 > - (match_operand:VHF_32_64 1 "register_operand" "0,x,x"))) > - (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))] > + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x") > + (absneg:VHBF_32_64 > + (match_operand:VHBF_32_64 1 "register_operand" "0,x,x"))) > + (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))] > "TARGET_SSE" > "#" > "&& reload_completed" > @@ -2115,11 +2119,11 @@ > [(set_attr "isa" "noavx,noavx,avx")]) > > (define_insn_and_split "*mmx_nabs<mode>2" > - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x") > - (neg:VHF_32_64 > - (abs:VHF_32_64 > - (match_operand:VHF_32_64 1 "register_operand" "0,x,x")))) > - (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))] > + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x") > + (neg:VHBF_32_64 > + (abs:VHBF_32_64 > + (match_operand:VHBF_32_64 1 "register_operand" "0,x,x")))) > + (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))] > "TARGET_SSE" > "#" > "&& reload_completed" > @@ -2410,11 +2414,11 @@ > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > (define_insn "*mmx_andnot<mode>3" > - [(set 
(match_operand:VHF_32_64 0 "register_operand" "=x,x") > - (and:VHF_32_64 > - (not:VHF_32_64 > - (match_operand:VHF_32_64 1 "register_operand" "0,x")) > - (match_operand:VHF_32_64 2 "register_operand" "x,x")))] > + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x") > + (and:VHBF_32_64 > + (not:VHBF_32_64 > + (match_operand:VHBF_32_64 1 "register_operand" "0,x")) > + (match_operand:VHBF_32_64 2 "register_operand" "x,x")))] > "TARGET_SSE" > "@ > andnps\t{%2, %0|%0, %2} > @@ -2425,10 +2429,10 @@ > (set_attr "mode" "V4SF")]) > > (define_insn "<code><mode>3" > - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x") > - (any_logic:VHF_32_64 > - (match_operand:VHF_32_64 1 "register_operand" "%0,x") > - (match_operand:VHF_32_64 2 "register_operand" " x,x")))] > + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x") > + (any_logic:VHBF_32_64 > + (match_operand:VHBF_32_64 1 "register_operand" "%0,x") > + (match_operand:VHBF_32_64 2 "register_operand" " x,x")))] > "TARGET_SSE" > "@ > <logic>ps\t{%2, %0|%0, %2} > @@ -2440,14 +2444,14 @@ > > (define_expand "copysign<mode>3" > [(set (match_dup 4) > - (and:VHF_32_64 > - (not:VHF_32_64 (match_dup 3)) > - (match_operand:VHF_32_64 1 "register_operand"))) > + (and:VHBF_32_64 > + (not:VHBF_32_64 (match_dup 3)) > + (match_operand:VHBF_32_64 1 "register_operand"))) > (set (match_dup 5) > - (and:VHF_32_64 (match_dup 3) > - (match_operand:VHF_32_64 2 "register_operand"))) > - (set (match_operand:VHF_32_64 0 "register_operand") > - (ior:VHF_32_64 (match_dup 4) (match_dup 5)))] > + (and:VHBF_32_64 (match_dup 3) > + (match_operand:VHBF_32_64 2 "register_operand"))) > + (set (match_operand:VHBF_32_64 0 "register_operand") > + (ior:VHBF_32_64 (match_dup 4) (match_dup 5)))] > "TARGET_SSE" > { > operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false); > @@ -2458,11 +2462,11 @@ > > (define_expand "xorsign<mode>3" > [(set (match_dup 4) > - (and:VHF_32_64 (match_dup 3) > - (match_operand:VHF_32_64 2 
"register_operand"))) > - (set (match_operand:VHF_32_64 0 "register_operand") > - (xor:VHF_32_64 (match_dup 4) > - (match_operand:VHF_32_64 1 "register_operand")))] > + (and:VHBF_32_64 (match_dup 3) > + (match_operand:VHBF_32_64 2 "register_operand"))) > + (set (match_operand:VHBF_32_64 0 "register_operand") > + (xor:VHBF_32_64 (match_dup 4) > + (match_operand:VHBF_32_64 1 "register_operand")))] > "TARGET_SSE" > { > operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false); > @@ -2474,7 +2478,7 @@ > [(set (match_operand:<mmxintvecmode> 0 "register_operand") > (lshiftrt:<mmxintvecmode> > (subreg:<mmxintvecmode> > - (match_operand:VHF_32_64 1 "register_operand") 0) > + (match_operand:VHBF_32_64 1 "register_operand") 0) > (match_dup 2)))] > "TARGET_SSE2" > { > diff --git a/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c > new file mode 100644 > index 00000000000..2d7ae35298e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c > @@ -0,0 +1,81 @@ > +/* { dg-do run } */ > +/* { dg-options "-O1 -fdump-tree-vect-details -fdump-tree-slp-details -fdump-tree-optimized" } */ > + > +extern void abort (void); > +static void do_test (void); > + > +#define DO_TEST do_test > +#define AVX512BF16 > +#include "avx512-check.h" > + > +__bf16 b_32[2], r_abs_32[2], r_neg_32[2]; > +__bf16 b_64[4], r_abs_64[4], r_neg_64[4]; > + > +void > +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf)) > +abs_32 (void) > +{ > + for (int i = 0; i < 2; i++) > + r_abs_32[i] = __builtin_fabsf16 (b_32[i]); > +} > + > +void > +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf)) > +neg_32 (void) > +{ > + for (int i = 0; i < 2; i++) > + r_neg_32[i] = -b_32[i]; > +} > + > +void > +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf)) > +abs_64 (void) > +{ > + for (int i = 0; i < 4; i++) > + r_abs_64[i] = __builtin_fabsf16 (b_64[i]); > +} > + > +void > +__attribute__((optimize ("O2"), 
noinline, noipa, noclone, no_icf)) > +neg_64 (void) > +{ > + for (int i = 0; i < 4; i++) > + r_neg_64[i] = -b_64[i]; > +} > + > +void > +check_absneg_results (__bf16 *b, __bf16 *r_abs, __bf16 *r_neg, int len) > +{ > + for (int i = 0; i < len; i++) > + { > + __bf16 expected_abs = __builtin_fabsf16 (b[i]); > + __bf16 expected_neg = -b[i]; > + if (r_abs[i] != expected_abs || r_neg[i] != expected_neg) > + abort (); > + } > +} > + > +static void > +__attribute__ ((noinline, noclone)) > +do_test (void) > +{ > + float float_b[16] = {-1.2f, 3.4f, -5.6f, 7.8f}; > + > + for (int i = 0; i < 2; i++) > + b_32[i] = (__bf16) float_b[i]; > + > + for (int i = 0; i < 4; i++) > + b_64[i] = (__bf16) float_b[i]; > + > + abs_32 (); > + neg_32 (); > + check_absneg_results (b_32, r_abs_32, r_neg_32, 2); > + > + abs_64 (); > + neg_64 (); > + check_absneg_results (b_64, r_abs_64, r_neg_64, 4); > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized using 4 byte vectors" 2 "slp1" } } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 8 byte vectors" 2 "vect" { target { ! ia32 } } } } */ > +/* { dg-final { scan-tree-dump-times {(?n)ABS_EXPR <vect} 2 "optimized" { target { ! ia32 } } } } */ > -- > 2.31.1 >
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 78bf890f14b..2bbfb1bf5fc 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -16176,6 +16176,8 @@ ix86_build_const_vector (machine_mode mode, bool vect, rtx value) case E_V32BFmode: case E_V16BFmode: case E_V8BFmode: + case E_V4BFmode: + case E_V2BFmode: n_elt = GET_MODE_NUNITS (mode); v = rtvec_alloc (n_elt); scalar_mode = GET_MODE_INNER (mode); @@ -16215,6 +16217,8 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) case E_V32BFmode: case E_V16BFmode: case E_V8BFmode: + case E_V4BFmode: + case E_V2BFmode: vec_mode = mode; imode = HImode; break; diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index cb2697537a8..44adcd8d8e0 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -121,7 +121,7 @@ ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr mmxintvecmode [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI") - (V4HF "V4HI") (V2HF "V2HI")]) + (V4HF "V4HI") (V2HF "V2HI") (V4BF "V4HI") (V2BF "V2HI")]) (define_mode_attr mmxintvecmodelower [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi") @@ -2091,18 +2091,22 @@ DONE; }) +(define_mode_iterator VHBF_32_64 + [V2BF (V4BF "TARGET_MMX_WITH_SSE") + V2HF (V4HF "TARGET_MMX_WITH_SSE")]) + (define_expand "<code><mode>2" - [(set (match_operand:VHF_32_64 0 "register_operand") - (absneg:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand")))] + [(set (match_operand:VHBF_32_64 0 "register_operand") + (absneg:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand")))] "TARGET_SSE" "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;") (define_insn_and_split "*mmx_<code><mode>" - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x") - (absneg:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand" "0,x,x"))) - (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))] + [(set (match_operand:VHBF_32_64 0 
"register_operand" "=x,x,x") + (absneg:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand" "0,x,x"))) + (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))] "TARGET_SSE" "#" "&& reload_completed" @@ -2115,11 +2119,11 @@ [(set_attr "isa" "noavx,noavx,avx")]) (define_insn_and_split "*mmx_nabs<mode>2" - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x") - (neg:VHF_32_64 - (abs:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand" "0,x,x")))) - (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))] + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x") + (neg:VHBF_32_64 + (abs:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand" "0,x,x")))) + (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))] "TARGET_SSE" "#" "&& reload_completed" @@ -2410,11 +2414,11 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_insn "*mmx_andnot<mode>3" - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x") - (and:VHF_32_64 - (not:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand" "0,x")) - (match_operand:VHF_32_64 2 "register_operand" "x,x")))] + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x") + (and:VHBF_32_64 + (not:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand" "0,x")) + (match_operand:VHBF_32_64 2 "register_operand" "x,x")))] "TARGET_SSE" "@ andnps\t{%2, %0|%0, %2} @@ -2425,10 +2429,10 @@ (set_attr "mode" "V4SF")]) (define_insn "<code><mode>3" - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x") - (any_logic:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand" "%0,x") - (match_operand:VHF_32_64 2 "register_operand" " x,x")))] + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x") + (any_logic:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand" "%0,x") + (match_operand:VHBF_32_64 2 "register_operand" " x,x")))] "TARGET_SSE" "@ <logic>ps\t{%2, %0|%0, %2} @@ -2440,14 +2444,14 @@ (define_expand "copysign<mode>3" [(set (match_dup 
4) - (and:VHF_32_64 - (not:VHF_32_64 (match_dup 3)) - (match_operand:VHF_32_64 1 "register_operand"))) + (and:VHBF_32_64 + (not:VHBF_32_64 (match_dup 3)) + (match_operand:VHBF_32_64 1 "register_operand"))) (set (match_dup 5) - (and:VHF_32_64 (match_dup 3) - (match_operand:VHF_32_64 2 "register_operand"))) - (set (match_operand:VHF_32_64 0 "register_operand") - (ior:VHF_32_64 (match_dup 4) (match_dup 5)))] + (and:VHBF_32_64 (match_dup 3) + (match_operand:VHBF_32_64 2 "register_operand"))) + (set (match_operand:VHBF_32_64 0 "register_operand") + (ior:VHBF_32_64 (match_dup 4) (match_dup 5)))] "TARGET_SSE" { operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false); @@ -2458,11 +2462,11 @@ (define_expand "xorsign<mode>3" [(set (match_dup 4) - (and:VHF_32_64 (match_dup 3) - (match_operand:VHF_32_64 2 "register_operand"))) - (set (match_operand:VHF_32_64 0 "register_operand") - (xor:VHF_32_64 (match_dup 4) - (match_operand:VHF_32_64 1 "register_operand")))] + (and:VHBF_32_64 (match_dup 3) + (match_operand:VHBF_32_64 2 "register_operand"))) + (set (match_operand:VHBF_32_64 0 "register_operand") + (xor:VHBF_32_64 (match_dup 4) + (match_operand:VHBF_32_64 1 "register_operand")))] "TARGET_SSE" { operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false); @@ -2474,7 +2478,7 @@ [(set (match_operand:<mmxintvecmode> 0 "register_operand") (lshiftrt:<mmxintvecmode> (subreg:<mmxintvecmode> - (match_operand:VHF_32_64 1 "register_operand") 0) + (match_operand:VHBF_32_64 1 "register_operand") 0) (match_dup 2)))] "TARGET_SSE2" { diff --git a/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c new file mode 100644 index 00000000000..2d7ae35298e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c @@ -0,0 +1,81 @@ +/* { dg-do run } */ +/* { dg-options "-O1 -fdump-tree-vect-details -fdump-tree-slp-details -fdump-tree-optimized" } */ + +extern void abort (void); +static void do_test (void); + +#define DO_TEST 
do_test +#define AVX512BF16 +#include "avx512-check.h" + +__bf16 b_32[2], r_abs_32[2], r_neg_32[2]; +__bf16 b_64[4], r_abs_64[4], r_neg_64[4]; + +void +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf)) +abs_32 (void) +{ + for (int i = 0; i < 2; i++) + r_abs_32[i] = __builtin_fabsf16 (b_32[i]); +} + +void +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf)) +neg_32 (void) +{ + for (int i = 0; i < 2; i++) + r_neg_32[i] = -b_32[i]; +} + +void +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf)) +abs_64 (void) +{ + for (int i = 0; i < 4; i++) + r_abs_64[i] = __builtin_fabsf16 (b_64[i]); +} + +void +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf)) +neg_64 (void) +{ + for (int i = 0; i < 4; i++) + r_neg_64[i] = -b_64[i]; +} + +void +check_absneg_results (__bf16 *b, __bf16 *r_abs, __bf16 *r_neg, int len) +{ + for (int i = 0; i < len; i++) + { + __bf16 expected_abs = __builtin_fabsf16 (b[i]); + __bf16 expected_neg = -b[i]; + if (r_abs[i] != expected_abs || r_neg[i] != expected_neg) + abort (); + } +} + +static void +__attribute__ ((noinline, noclone)) +do_test (void) +{ + float float_b[16] = {-1.2f, 3.4f, -5.6f, 7.8f}; + + for (int i = 0; i < 2; i++) + b_32[i] = (__bf16) float_b[i]; + + for (int i = 0; i < 4; i++) + b_64[i] = (__bf16) float_b[i]; + + abs_32 (); + neg_32 (); + check_absneg_results (b_32, r_abs_32, r_neg_32, 2); + + abs_64 (); + neg_64 (); + check_absneg_results (b_64, r_abs_64, r_neg_64, 4); +} + +/* { dg-final { scan-tree-dump-times "vectorized using 4 byte vectors" 2 "slp1" } } */ +/* { dg-final { scan-tree-dump-times "loop vectorized using 8 byte vectors" 2 "vect" { target { ! ia32 } } } } */ +/* { dg-final { scan-tree-dump-times {(?n)ABS_EXPR <vect} 2 "optimized" { target { ! ia32 } } } } */