Message ID | 20230807085701.302936-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | i386: Clear upper bits of XMM register for V4HFmode/V2HFmode operations [PR110762] | expand |
On Mon, Aug 7, 2023 at 10:57 AM liuhongt <hongtao.liu@intel.com> wrote: > > Similar to r14-2786-gade30fad6669e5, the patch is for V4HF/V2HFmode. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk? > > gcc/ChangeLog: > > PR target/110762 > * config/i386/mmx.md (<insn><mode>3): Changed from define_insn > to define_expand and broken into .. > (<insn>v4hf3): .. this. > (divv4hf3): .. this. > (<insn>v2hf3): .. this. > (divv2hf3): .. this. > (movd_v2hf_to_sse): New define_expand. > (movq_<mode>_to_sse): Extend to V4HFmode. > (mmxdoublevecmode): Ditto. > (V2FI_V4HF): New mode iterator. > * config/i386/sse.md (*vec_concatv4sf): Extend to handle V8HF > by using mode iterator V4SF_V8HF, renamed to .. > (*vec_concat<mode>): .. this. > (*vec_concatv4sf_0): Extend to handle V8HF by using mode > iterator V4SF_V8HF, renamed to .. > (*vec_concat<mode>_0): .. this. > (*vec_concatv8hf_movss): New define_insn. > (V4SF_V8HF): New mode iterator. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr110762-v4hf.c: New test. LGTM. Please also note the RFC patch [1] that relaxes clears for V2SFmode with -fno-trapping-math. The patched compiler will then emit the same code as clang does for -O2. Which raises another question - should gcc default to -fno-trapping-math? [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625795.html Thanks, Uros. 
> --- > gcc/config/i386/mmx.md | 109 +++++++++++++++--- > gcc/config/i386/sse.md | 40 +++++-- > gcc/testsuite/gcc.target/i386/pr110762-v4hf.c | 57 +++++++++ > 3 files changed, 177 insertions(+), 29 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 896af76a33f..88bdf084f54 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64 > ;; V2S* modes > (define_mode_iterator V2FI [V2SF V2SI]) > > -;; 4-byte and 8-byte float16 vector modes > -(define_mode_iterator VHF_32_64 [V4HF V2HF]) > - > +(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF]) > ;; Mapping from integer vector mode to mnemonic suffix > (define_mode_attr mmxvecsize > [(V8QI "b") (V4QI "b") (V2QI "b") > @@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower > > ;; Mapping of vector modes to a vector mode of double size > (define_mode_attr mmxdoublevecmode > - [(V2SF "V4SF") (V2SI "V4SI")]) > + [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")]) > > ;; Mapping of vector modes back to the scalar modes > (define_mode_attr mmxscalarmode > @@ -594,7 +592,7 @@ (define_insn "sse_movntq" > (define_expand "movq_<mode>_to_sse" > [(set (match_operand:<mmxdoublevecmode> 0 "register_operand") > (vec_concat:<mmxdoublevecmode> > - (match_operand:V2FI 1 "nonimmediate_operand") > + (match_operand:V2FI_V4HF 1 "nonimmediate_operand") > (match_dup 2)))] > "TARGET_SSE2" > "operands[2] = CONST0_RTX (<MODE>mode);") > @@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2" > ;; > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > -(define_insn "<insn><mode>3" > - [(set (match_operand:VHF_32_64 0 "register_operand" "=v") > - (plusminusmultdiv:VHF_32_64 > - (match_operand:VHF_32_64 1 "register_operand" "<comm>v") > - (match_operand:VHF_32_64 2 "register_operand" "v")))] > +(define_expand "<insn>v4hf3" > + [(set (match_operand:V4HF 0 "register_operand") 
> + (plusminusmult:V4HF > + (match_operand:V4HF 1 "nonimmediate_operand") > + (match_operand:V4HF 2 "nonimmediate_operand")))] > "TARGET_AVX512FP16 && TARGET_AVX512VL" > - "v<insn>ph\t{%2, %1, %0|%0, %1, %2}" > - [(set (attr "type") > - (cond [(match_test "<CODE> == MULT") > - (const_string "ssemul") > - (match_test "<CODE> == DIV") > - (const_string "ssediv")] > - (const_string "sseadd"))) > - (set_attr "prefix" "evex") > - (set_attr "mode" "V8HF")]) > +{ > + rtx op2 = gen_reg_rtx (V8HFmode); > + rtx op1 = gen_reg_rtx (V8HFmode); > + rtx op0 = gen_reg_rtx (V8HFmode); > + > + emit_insn (gen_movq_v4hf_to_sse (op2, operands[2])); > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); > + > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2)); > + > + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); > + DONE; > +}) > + > +(define_expand "divv4hf3" > + [(set (match_operand:V4HF 0 "register_operand") > + (div:V4HF > + (match_operand:V4HF 1 "nonimmediate_operand") > + (match_operand:V4HF 2 "nonimmediate_operand")))] > + "TARGET_AVX512FP16 && TARGET_AVX512VL" > +{ > + rtx op2 = gen_reg_rtx (V8HFmode); > + rtx op1 = gen_reg_rtx (V8HFmode); > + rtx op0 = gen_reg_rtx (V8HFmode); > + > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); > + rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2], > + force_reg (V4HFmode, CONST1_RTX (V4HFmode))); > + emit_insn (gen_rtx_SET (op2, tmp)); > + emit_insn (gen_divv8hf3 (op0, op1, op2)); > + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); > + DONE; > +}) > + > +(define_expand "movd_v2hf_to_sse" > + [(set (match_operand:V8HF 0 "register_operand") > + (vec_merge:V8HF > + (vec_duplicate:V8HF > + (match_operand:V2HF 1 "nonimmediate_operand")) > + (match_operand:V8HF 2 "reg_or_0_operand") > + (const_int 3)))] > + "TARGET_SSE") > + > +(define_expand "<insn>v2hf3" > + [(set (match_operand:V2HF 0 "register_operand") > + (plusminusmult:V2HF > + (match_operand:V2HF 1 "nonimmediate_operand") > + 
(match_operand:V2HF 2 "nonimmediate_operand")))] > + "TARGET_AVX512FP16 && TARGET_AVX512VL" > +{ > + rtx op2 = gen_reg_rtx (V8HFmode); > + rtx op1 = gen_reg_rtx (V8HFmode); > + rtx op0 = gen_reg_rtx (V8HFmode); > + > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], CONST0_RTX (V8HFmode))); > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode))); > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2)); > + > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode)); > + DONE; > +}) > + > +(define_expand "divv2hf3" > + [(set (match_operand:V2HF 0 "register_operand") > + (div:V2HF > + (match_operand:V2HF 1 "nonimmediate_operand") > + (match_operand:V2HF 2 "nonimmediate_operand")))] > + "TARGET_AVX512FP16 && TARGET_AVX512VL" > +{ > + rtx op2 = gen_reg_rtx (V8HFmode); > + rtx op1 = gen_reg_rtx (V8HFmode); > + rtx op0 = gen_reg_rtx (V8HFmode); > + > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], > + force_reg (V8HFmode, CONST1_RTX (V8HFmode)))); > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode))); > + emit_insn (gen_divv8hf3 (op0, op1, op2)); > + > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode)); > + DONE; > +}) > + > > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > ;; > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index ab455c3e297..7383a50ee0d 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -430,6 +430,9 @@ (define_mode_iterator VF_512 > (define_mode_iterator VFB_512 > [V32HF V16SF V8DF]) > > +(define_mode_iterator V4SF_V8HF > + [V4SF V8HF]) > + > (define_mode_iterator VI48_AVX512VL > [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") > V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) > @@ -10873,11 +10876,11 @@ (define_insn "*vec_concatv2sf_sse" > (set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") > (set_attr "mode" "V4SF,SF,DI,DI")]) > > -(define_insn "*vec_concatv4sf" > - [(set (match_operand:V4SF 
0 "register_operand" "=x,v,x,v") > - (vec_concat:V4SF > - (match_operand:V2SF 1 "register_operand" " 0,v,0,v") > - (match_operand:V2SF 2 "nonimmediate_operand" " x,v,m,m")))] > +(define_insn "*vec_concat<mode>" > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=x,v,x,v") > + (vec_concat:V4SF_V8HF > + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v") > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))] > "TARGET_SSE" > "@ > movlhps\t{%2, %0|%0, %2} > @@ -10889,17 +10892,34 @@ (define_insn "*vec_concatv4sf" > (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") > (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) > > -(define_insn "*vec_concatv4sf_0" > - [(set (match_operand:V4SF 0 "register_operand" "=v") > - (vec_concat:V4SF > - (match_operand:V2SF 1 "nonimmediate_operand" "vm") > - (match_operand:V2SF 2 "const0_operand")))] > +(define_insn "*vec_concat<mode>_0" > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=v") > + (vec_concat:V4SF_V8HF > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") > + (match_operand:<ssehalfvecmode> 2 "const0_operand")))] > "TARGET_SSE2" > "%vmovq\t{%1, %0|%0, %1}" > [(set_attr "type" "ssemov") > (set_attr "prefix" "maybe_vex") > (set_attr "mode" "DF")]) > > +(define_insn "*vec_concatv8hf_movss" > + [(set (match_operand:V8HF 0 "register_operand" "=x,v,v") > + (vec_merge:V8HF > + (vec_duplicate:V8HF > + (match_operand:V2HF 2 "nonimmediate_operand" "x,m,v")) > + (match_operand:V8HF 1 "reg_or_0_operand" "0,C,v" ) > + (const_int 3)))] > + "TARGET_SSE" > + "@ > + movss\t{%2, %0|%0, %2} > + %vmovss\t{%2, %0|%0, %2} > + vmovss\t{%2, %1, %0|%0, %1, %2}" > + [(set_attr "isa" "noavx,*,avx") > + (set_attr "type" "ssemov") > + (set_attr "prefix" "orig,maybe_vex,maybe_vex") > + (set_attr "mode" "SF")]) > + > ;; Avoid combining registers from different units in a single alternative, > ;; see comment above inline_secondary_memory_needed function in i386.cc > (define_insn "vec_set<mode>_0" > 
diff --git a/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c > new file mode 100644 > index 00000000000..332784ac694 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c > @@ -0,0 +1,57 @@ > +/* PR target/110762 */ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -dp" } */ > + > +typedef _Float16 v4hf __attribute__((vector_size(8))); > +typedef _Float16 v2hf __attribute__((vector_size(4))); > + > +v4hf > +foo (v4hf a, v4hf b) > +{ > + return a + b; > +} > + > +v4hf > +foo2 (v4hf a, v4hf b) > +{ > + return a - b; > +} > + > +v4hf > +foo3 (v4hf a, v4hf b) > +{ > + return a * b; > +} > + > +v4hf > +foo1 (v4hf a, v4hf b) > +{ > + return a / b; > +} > + > +v2hf > +foo4 (v2hf a, v2hf b) > +{ > + return a + b; > +} > + > +v2hf > +foo5 (v2hf a, v2hf b) > +{ > + return a - b; > +} > + > +v2hf > +foo6 (v2hf a, v2hf b) > +{ > + return a * b; > +} > + > +v2hf > +foo7 (v2hf a, v2hf b) > +{ > + return a / b; > +} > + > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_0" 7 } } */ > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_movss" 8 } } */ > -- > 2.31.1 >
On Mon, Aug 7, 2023 at 11:20 AM Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Mon, Aug 7, 2023 at 10:57 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > Similar to r14-2786-gade30fad6669e5, the patch is for V4HF/V2HFmode. > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ok for trunk? > > > > gcc/ChangeLog: > > > > PR target/110762 > > * config/i386/mmx.md (<insn><mode>3): Changed from define_insn > > to define_expand and broken into .. > > (<insn>v4hf3): .. this. > > (divv4hf3): .. this. > > (<insn>v2hf3): .. this. > > (divv2hf3): .. this. > > (movd_v2hf_to_sse): New define_expand. > > (movq_<mode>_to_sse): Extend to V4HFmode. > > (mmxdoublevecmode): Ditto. > > (V2FI_V4HF): New mode iterator. > > * config/i386/sse.md (*vec_concatv4sf): Extend to handle V8HF > > by using mode iterator V4SF_V8HF, renamed to .. > > (*vec_concat<mode>): .. this. > > (*vec_concatv4sf_0): Extend to handle V8HF by using mode > > iterator V4SF_V8HF, renamed to .. > > (*vec_concat<mode>_0): .. this. > > (*vec_concatv8hf_movss): New define_insn. > > (V4SF_V8HF): New mode iterator. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/pr110762-v4hf.c: New test. > > LGTM. > > Please also note the RFC patch [1] that relaxes clears for V2SFmode > with -fno-trapping-math. The patched compiler will then emit the same > code as clang does for -O2. Which raises another question - should gcc > default to -fno-trapping-math? I think we discussed this before and yes, IMHO we should default to -fno-trapping-math at least for C/C++ to be consistent with our other handling of the FP environment (default to -fno-rounding-math) and given the lack of proper FENV access barriers for inspecting the exceptions. Note Fortran has the -ffpe-trap= option which would then need to make sure to also enable -ftrapping-math. Ada might have similar constraints (it also uses -fnon-call-exceptions, but unless it enables CPU traps for FP exceptions that would be a no-op). 
Note this also shows we should possibly separate maintaining the IEEE exception state and considering changes in the IEEE exception states to cause CPU traps (that's also a source of common confusion on the user side). Richard. > [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625795.html > > Thanks, > Uros. > > > --- > > gcc/config/i386/mmx.md | 109 +++++++++++++++--- > > gcc/config/i386/sse.md | 40 +++++-- > > gcc/testsuite/gcc.target/i386/pr110762-v4hf.c | 57 +++++++++ > > 3 files changed, 177 insertions(+), 29 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c > > > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > > index 896af76a33f..88bdf084f54 100644 > > --- a/gcc/config/i386/mmx.md > > +++ b/gcc/config/i386/mmx.md > > @@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64 > > ;; V2S* modes > > (define_mode_iterator V2FI [V2SF V2SI]) > > > > -;; 4-byte and 8-byte float16 vector modes > > -(define_mode_iterator VHF_32_64 [V4HF V2HF]) > > - > > +(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF]) > > ;; Mapping from integer vector mode to mnemonic suffix > > (define_mode_attr mmxvecsize > > [(V8QI "b") (V4QI "b") (V2QI "b") > > @@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower > > > > ;; Mapping of vector modes to a vector mode of double size > > (define_mode_attr mmxdoublevecmode > > - [(V2SF "V4SF") (V2SI "V4SI")]) > > + [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")]) > > > > ;; Mapping of vector modes back to the scalar modes > > (define_mode_attr mmxscalarmode > > @@ -594,7 +592,7 @@ (define_insn "sse_movntq" > > (define_expand "movq_<mode>_to_sse" > > [(set (match_operand:<mmxdoublevecmode> 0 "register_operand") > > (vec_concat:<mmxdoublevecmode> > > - (match_operand:V2FI 1 "nonimmediate_operand") > > + (match_operand:V2FI_V4HF 1 "nonimmediate_operand") > > (match_dup 2)))] > > "TARGET_SSE2" > > "operands[2] = CONST0_RTX (<MODE>mode);") > > @@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2" > > ;; 
> > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > > > -(define_insn "<insn><mode>3" > > - [(set (match_operand:VHF_32_64 0 "register_operand" "=v") > > - (plusminusmultdiv:VHF_32_64 > > - (match_operand:VHF_32_64 1 "register_operand" "<comm>v") > > - (match_operand:VHF_32_64 2 "register_operand" "v")))] > > +(define_expand "<insn>v4hf3" > > + [(set (match_operand:V4HF 0 "register_operand") > > + (plusminusmult:V4HF > > + (match_operand:V4HF 1 "nonimmediate_operand") > > + (match_operand:V4HF 2 "nonimmediate_operand")))] > > "TARGET_AVX512FP16 && TARGET_AVX512VL" > > - "v<insn>ph\t{%2, %1, %0|%0, %1, %2}" > > - [(set (attr "type") > > - (cond [(match_test "<CODE> == MULT") > > - (const_string "ssemul") > > - (match_test "<CODE> == DIV") > > - (const_string "ssediv")] > > - (const_string "sseadd"))) > > - (set_attr "prefix" "evex") > > - (set_attr "mode" "V8HF")]) > > +{ > > + rtx op2 = gen_reg_rtx (V8HFmode); > > + rtx op1 = gen_reg_rtx (V8HFmode); > > + rtx op0 = gen_reg_rtx (V8HFmode); > > + > > + emit_insn (gen_movq_v4hf_to_sse (op2, operands[2])); > > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); > > + > > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2)); > > + > > + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); > > + DONE; > > +}) > > + > > +(define_expand "divv4hf3" > > + [(set (match_operand:V4HF 0 "register_operand") > > + (div:V4HF > > + (match_operand:V4HF 1 "nonimmediate_operand") > > + (match_operand:V4HF 2 "nonimmediate_operand")))] > > + "TARGET_AVX512FP16 && TARGET_AVX512VL" > > +{ > > + rtx op2 = gen_reg_rtx (V8HFmode); > > + rtx op1 = gen_reg_rtx (V8HFmode); > > + rtx op0 = gen_reg_rtx (V8HFmode); > > + > > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); > > + rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2], > > + force_reg (V4HFmode, CONST1_RTX (V4HFmode))); > > + emit_insn (gen_rtx_SET (op2, tmp)); > > + emit_insn (gen_divv8hf3 (op0, op1, op2)); > > + emit_move_insn 
(operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); > > + DONE; > > +}) > > + > > +(define_expand "movd_v2hf_to_sse" > > + [(set (match_operand:V8HF 0 "register_operand") > > + (vec_merge:V8HF > > + (vec_duplicate:V8HF > > + (match_operand:V2HF 1 "nonimmediate_operand")) > > + (match_operand:V8HF 2 "reg_or_0_operand") > > + (const_int 3)))] > > + "TARGET_SSE") > > + > > +(define_expand "<insn>v2hf3" > > + [(set (match_operand:V2HF 0 "register_operand") > > + (plusminusmult:V2HF > > + (match_operand:V2HF 1 "nonimmediate_operand") > > + (match_operand:V2HF 2 "nonimmediate_operand")))] > > + "TARGET_AVX512FP16 && TARGET_AVX512VL" > > +{ > > + rtx op2 = gen_reg_rtx (V8HFmode); > > + rtx op1 = gen_reg_rtx (V8HFmode); > > + rtx op0 = gen_reg_rtx (V8HFmode); > > + > > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], CONST0_RTX (V8HFmode))); > > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode))); > > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2)); > > + > > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode)); > > + DONE; > > +}) > > + > > +(define_expand "divv2hf3" > > + [(set (match_operand:V2HF 0 "register_operand") > > + (div:V2HF > > + (match_operand:V2HF 1 "nonimmediate_operand") > > + (match_operand:V2HF 2 "nonimmediate_operand")))] > > + "TARGET_AVX512FP16 && TARGET_AVX512VL" > > +{ > > + rtx op2 = gen_reg_rtx (V8HFmode); > > + rtx op1 = gen_reg_rtx (V8HFmode); > > + rtx op0 = gen_reg_rtx (V8HFmode); > > + > > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], > > + force_reg (V8HFmode, CONST1_RTX (V8HFmode)))); > > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode))); > > + emit_insn (gen_divv8hf3 (op0, op1, op2)); > > + > > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode)); > > + DONE; > > +}) > > + > > > > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > ;; > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > > index 
ab455c3e297..7383a50ee0d 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -430,6 +430,9 @@ (define_mode_iterator VF_512 > > (define_mode_iterator VFB_512 > > [V32HF V16SF V8DF]) > > > > +(define_mode_iterator V4SF_V8HF > > + [V4SF V8HF]) > > + > > (define_mode_iterator VI48_AVX512VL > > [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") > > V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) > > @@ -10873,11 +10876,11 @@ (define_insn "*vec_concatv2sf_sse" > > (set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") > > (set_attr "mode" "V4SF,SF,DI,DI")]) > > > > -(define_insn "*vec_concatv4sf" > > - [(set (match_operand:V4SF 0 "register_operand" "=x,v,x,v") > > - (vec_concat:V4SF > > - (match_operand:V2SF 1 "register_operand" " 0,v,0,v") > > - (match_operand:V2SF 2 "nonimmediate_operand" " x,v,m,m")))] > > +(define_insn "*vec_concat<mode>" > > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=x,v,x,v") > > + (vec_concat:V4SF_V8HF > > + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v") > > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))] > > "TARGET_SSE" > > "@ > > movlhps\t{%2, %0|%0, %2} > > @@ -10889,17 +10892,34 @@ (define_insn "*vec_concatv4sf" > > (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") > > (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) > > > > -(define_insn "*vec_concatv4sf_0" > > - [(set (match_operand:V4SF 0 "register_operand" "=v") > > - (vec_concat:V4SF > > - (match_operand:V2SF 1 "nonimmediate_operand" "vm") > > - (match_operand:V2SF 2 "const0_operand")))] > > +(define_insn "*vec_concat<mode>_0" > > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=v") > > + (vec_concat:V4SF_V8HF > > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") > > + (match_operand:<ssehalfvecmode> 2 "const0_operand")))] > > "TARGET_SSE2" > > "%vmovq\t{%1, %0|%0, %1}" > > [(set_attr "type" "ssemov") > > (set_attr "prefix" "maybe_vex") > > (set_attr "mode" "DF")]) > > > > 
+(define_insn "*vec_concatv8hf_movss" > > + [(set (match_operand:V8HF 0 "register_operand" "=x,v,v") > > + (vec_merge:V8HF > > + (vec_duplicate:V8HF > > + (match_operand:V2HF 2 "nonimmediate_operand" "x,m,v")) > > + (match_operand:V8HF 1 "reg_or_0_operand" "0,C,v" ) > > + (const_int 3)))] > > + "TARGET_SSE" > > + "@ > > + movss\t{%2, %0|%0, %2} > > + %vmovss\t{%2, %0|%0, %2} > > + vmovss\t{%2, %1, %0|%0, %1, %2}" > > + [(set_attr "isa" "noavx,*,avx") > > + (set_attr "type" "ssemov") > > + (set_attr "prefix" "orig,maybe_vex,maybe_vex") > > + (set_attr "mode" "SF")]) > > + > > ;; Avoid combining registers from different units in a single alternative, > > ;; see comment above inline_secondary_memory_needed function in i386.cc > > (define_insn "vec_set<mode>_0" > > diff --git a/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c > > new file mode 100644 > > index 00000000000..332784ac694 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c > > @@ -0,0 +1,57 @@ > > +/* PR target/110762 */ > > +/* { dg-do compile { target { ! 
ia32 } } } */ > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -dp" } */ > > + > > +typedef _Float16 v4hf __attribute__((vector_size(8))); > > +typedef _Float16 v2hf __attribute__((vector_size(4))); > > + > > +v4hf > > +foo (v4hf a, v4hf b) > > +{ > > + return a + b; > > +} > > + > > +v4hf > > +foo2 (v4hf a, v4hf b) > > +{ > > + return a - b; > > +} > > + > > +v4hf > > +foo3 (v4hf a, v4hf b) > > +{ > > + return a * b; > > +} > > + > > +v4hf > > +foo1 (v4hf a, v4hf b) > > +{ > > + return a / b; > > +} > > + > > +v2hf > > +foo4 (v2hf a, v2hf b) > > +{ > > + return a + b; > > +} > > + > > +v2hf > > +foo5 (v2hf a, v2hf b) > > +{ > > + return a - b; > > +} > > + > > +v2hf > > +foo6 (v2hf a, v2hf b) > > +{ > > + return a * b; > > +} > > + > > +v2hf > > +foo7 (v2hf a, v2hf b) > > +{ > > + return a / b; > > +} > > + > > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_0" 7 } } */ > > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_movss" 8 } } */ > > -- > > 2.31.1 > >
On Mon, Aug 7, 2023 at 5:19 PM Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Mon, Aug 7, 2023 at 10:57 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > Similar to r14-2786-gade30fad6669e5, the patch is for V4HF/V2HFmode. > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ok for trunk? > > > > gcc/ChangeLog: > > > > PR target/110762 > > * config/i386/mmx.md (<insn><mode>3): Changed from define_insn > > to define_expand and broken into .. > > (<insn>v4hf3): .. this. > > (divv4hf3): .. this. > > (<insn>v2hf3): .. this. > > (divv2hf3): .. this. > > (movd_v2hf_to_sse): New define_expand. > > (movq_<mode>_to_sse): Extend to V4HFmode. > > (mmxdoublevecmode): Ditto. > > (V2FI_V4HF): New mode iterator. > > * config/i386/sse.md (*vec_concatv4sf): Extend to handle V8HF > > by using mode iterator V4SF_V8HF, renamed to .. > > (*vec_concat<mode>): .. this. > > (*vec_concatv4sf_0): Extend to handle V8HF by using mode > > iterator V4SF_V8HF, renamed to .. > > (*vec_concat<mode>_0): .. this. > > (*vec_concatv8hf_movss): New define_insn. > > (V4SF_V8HF): New mode iterator. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/pr110762-v4hf.c: New test. > > LGTM. > > Please also note the RFC patch [1] that relaxes clears for V2SFmode > with -fno-trapping-math. The patched compiler will then emit the same > code as clang does for -O2. Which raises another question - should gcc > default to -fno-trapping-math? > > [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625795.html > I can create another patch to handle my parts of the -fno-trapping-math optimization. > Thanks, > Uros. 
> > > --- > > gcc/config/i386/mmx.md | 109 +++++++++++++++--- > > gcc/config/i386/sse.md | 40 +++++-- > > gcc/testsuite/gcc.target/i386/pr110762-v4hf.c | 57 +++++++++ > > 3 files changed, 177 insertions(+), 29 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c > > > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > > index 896af76a33f..88bdf084f54 100644 > > --- a/gcc/config/i386/mmx.md > > +++ b/gcc/config/i386/mmx.md > > @@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64 > > ;; V2S* modes > > (define_mode_iterator V2FI [V2SF V2SI]) > > > > -;; 4-byte and 8-byte float16 vector modes > > -(define_mode_iterator VHF_32_64 [V4HF V2HF]) > > - > > +(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF]) > > ;; Mapping from integer vector mode to mnemonic suffix > > (define_mode_attr mmxvecsize > > [(V8QI "b") (V4QI "b") (V2QI "b") > > @@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower > > > > ;; Mapping of vector modes to a vector mode of double size > > (define_mode_attr mmxdoublevecmode > > - [(V2SF "V4SF") (V2SI "V4SI")]) > > + [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")]) > > > > ;; Mapping of vector modes back to the scalar modes > > (define_mode_attr mmxscalarmode > > @@ -594,7 +592,7 @@ (define_insn "sse_movntq" > > (define_expand "movq_<mode>_to_sse" > > [(set (match_operand:<mmxdoublevecmode> 0 "register_operand") > > (vec_concat:<mmxdoublevecmode> > > - (match_operand:V2FI 1 "nonimmediate_operand") > > + (match_operand:V2FI_V4HF 1 "nonimmediate_operand") > > (match_dup 2)))] > > "TARGET_SSE2" > > "operands[2] = CONST0_RTX (<MODE>mode);") > > @@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2" > > ;; > > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > > > -(define_insn "<insn><mode>3" > > - [(set (match_operand:VHF_32_64 0 "register_operand" "=v") > > - (plusminusmultdiv:VHF_32_64 > > - (match_operand:VHF_32_64 1 "register_operand" "<comm>v") > > - (match_operand:VHF_32_64 2 
"register_operand" "v")))] > > +(define_expand "<insn>v4hf3" > > + [(set (match_operand:V4HF 0 "register_operand") > > + (plusminusmult:V4HF > > + (match_operand:V4HF 1 "nonimmediate_operand") > > + (match_operand:V4HF 2 "nonimmediate_operand")))] > > "TARGET_AVX512FP16 && TARGET_AVX512VL" > > - "v<insn>ph\t{%2, %1, %0|%0, %1, %2}" > > - [(set (attr "type") > > - (cond [(match_test "<CODE> == MULT") > > - (const_string "ssemul") > > - (match_test "<CODE> == DIV") > > - (const_string "ssediv")] > > - (const_string "sseadd"))) > > - (set_attr "prefix" "evex") > > - (set_attr "mode" "V8HF")]) > > +{ > > + rtx op2 = gen_reg_rtx (V8HFmode); > > + rtx op1 = gen_reg_rtx (V8HFmode); > > + rtx op0 = gen_reg_rtx (V8HFmode); > > + > > + emit_insn (gen_movq_v4hf_to_sse (op2, operands[2])); > > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); > > + > > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2)); > > + > > + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); > > + DONE; > > +}) > > + > > +(define_expand "divv4hf3" > > + [(set (match_operand:V4HF 0 "register_operand") > > + (div:V4HF > > + (match_operand:V4HF 1 "nonimmediate_operand") > > + (match_operand:V4HF 2 "nonimmediate_operand")))] > > + "TARGET_AVX512FP16 && TARGET_AVX512VL" > > +{ > > + rtx op2 = gen_reg_rtx (V8HFmode); > > + rtx op1 = gen_reg_rtx (V8HFmode); > > + rtx op0 = gen_reg_rtx (V8HFmode); > > + > > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); > > + rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2], > > + force_reg (V4HFmode, CONST1_RTX (V4HFmode))); > > + emit_insn (gen_rtx_SET (op2, tmp)); > > + emit_insn (gen_divv8hf3 (op0, op1, op2)); > > + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); > > + DONE; > > +}) > > + > > +(define_expand "movd_v2hf_to_sse" > > + [(set (match_operand:V8HF 0 "register_operand") > > + (vec_merge:V8HF > > + (vec_duplicate:V8HF > > + (match_operand:V2HF 1 "nonimmediate_operand")) > > + (match_operand:V8HF 2 
"reg_or_0_operand") > > + (const_int 3)))] > > + "TARGET_SSE") > > + > > +(define_expand "<insn>v2hf3" > > + [(set (match_operand:V2HF 0 "register_operand") > > + (plusminusmult:V2HF > > + (match_operand:V2HF 1 "nonimmediate_operand") > > + (match_operand:V2HF 2 "nonimmediate_operand")))] > > + "TARGET_AVX512FP16 && TARGET_AVX512VL" > > +{ > > + rtx op2 = gen_reg_rtx (V8HFmode); > > + rtx op1 = gen_reg_rtx (V8HFmode); > > + rtx op0 = gen_reg_rtx (V8HFmode); > > + > > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], CONST0_RTX (V8HFmode))); > > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode))); > > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2)); > > + > > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode)); > > + DONE; > > +}) > > + > > +(define_expand "divv2hf3" > > + [(set (match_operand:V2HF 0 "register_operand") > > + (div:V2HF > > + (match_operand:V2HF 1 "nonimmediate_operand") > > + (match_operand:V2HF 2 "nonimmediate_operand")))] > > + "TARGET_AVX512FP16 && TARGET_AVX512VL" > > +{ > > + rtx op2 = gen_reg_rtx (V8HFmode); > > + rtx op1 = gen_reg_rtx (V8HFmode); > > + rtx op0 = gen_reg_rtx (V8HFmode); > > + > > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], > > + force_reg (V8HFmode, CONST1_RTX (V8HFmode)))); > > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode))); > > + emit_insn (gen_divv8hf3 (op0, op1, op2)); > > + > > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode)); > > + DONE; > > +}) > > + > > > > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > ;; > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > > index ab455c3e297..7383a50ee0d 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -430,6 +430,9 @@ (define_mode_iterator VF_512 > > (define_mode_iterator VFB_512 > > [V32HF V16SF V8DF]) > > > > +(define_mode_iterator V4SF_V8HF > > + [V4SF V8HF]) > > + > > (define_mode_iterator 
VI48_AVX512VL > > [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") > > V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) > > @@ -10873,11 +10876,11 @@ (define_insn "*vec_concatv2sf_sse" > > (set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") > > (set_attr "mode" "V4SF,SF,DI,DI")]) > > > > -(define_insn "*vec_concatv4sf" > > - [(set (match_operand:V4SF 0 "register_operand" "=x,v,x,v") > > - (vec_concat:V4SF > > - (match_operand:V2SF 1 "register_operand" " 0,v,0,v") > > - (match_operand:V2SF 2 "nonimmediate_operand" " x,v,m,m")))] > > +(define_insn "*vec_concat<mode>" > > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=x,v,x,v") > > + (vec_concat:V4SF_V8HF > > + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v") > > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))] > > "TARGET_SSE" > > "@ > > movlhps\t{%2, %0|%0, %2} > > @@ -10889,17 +10892,34 @@ (define_insn "*vec_concatv4sf" > > (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") > > (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) > > > > -(define_insn "*vec_concatv4sf_0" > > - [(set (match_operand:V4SF 0 "register_operand" "=v") > > - (vec_concat:V4SF > > - (match_operand:V2SF 1 "nonimmediate_operand" "vm") > > - (match_operand:V2SF 2 "const0_operand")))] > > +(define_insn "*vec_concat<mode>_0" > > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=v") > > + (vec_concat:V4SF_V8HF > > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") > > + (match_operand:<ssehalfvecmode> 2 "const0_operand")))] > > "TARGET_SSE2" > > "%vmovq\t{%1, %0|%0, %1}" > > [(set_attr "type" "ssemov") > > (set_attr "prefix" "maybe_vex") > > (set_attr "mode" "DF")]) > > > > +(define_insn "*vec_concatv8hf_movss" > > + [(set (match_operand:V8HF 0 "register_operand" "=x,v,v") > > + (vec_merge:V8HF > > + (vec_duplicate:V8HF > > + (match_operand:V2HF 2 "nonimmediate_operand" "x,m,v")) > > + (match_operand:V8HF 1 "reg_or_0_operand" "0,C,v" ) > > + (const_int 3)))] > > + 
"TARGET_SSE" > > + "@ > > + movss\t{%2, %0|%0, %2} > > + %vmovss\t{%2, %0|%0, %2} > > + vmovss\t{%2, %1, %0|%0, %1, %2}" > > + [(set_attr "isa" "noavx,*,avx") > > + (set_attr "type" "ssemov") > > + (set_attr "prefix" "orig,maybe_vex,maybe_vex") > > + (set_attr "mode" "SF")]) > > + > > ;; Avoid combining registers from different units in a single alternative, > > ;; see comment above inline_secondary_memory_needed function in i386.cc > > (define_insn "vec_set<mode>_0" > > diff --git a/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c > > new file mode 100644 > > index 00000000000..332784ac694 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c > > @@ -0,0 +1,57 @@ > > +/* PR target/110762 */ > > +/* { dg-do compile { target { ! ia32 } } } */ > > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -dp" } */ > > + > > +typedef _Float16 v4hf __attribute__((vector_size(8))); > > +typedef _Float16 v2hf __attribute__((vector_size(4))); > > + > > +v4hf > > +foo (v4hf a, v4hf b) > > +{ > > + return a + b; > > +} > > + > > +v4hf > > +foo2 (v4hf a, v4hf b) > > +{ > > + return a - b; > > +} > > + > > +v4hf > > +foo3 (v4hf a, v4hf b) > > +{ > > + return a * b; > > +} > > + > > +v4hf > > +foo1 (v4hf a, v4hf b) > > +{ > > + return a / b; > > +} > > + > > +v2hf > > +foo4 (v2hf a, v2hf b) > > +{ > > + return a + b; > > +} > > + > > +v2hf > > +foo5 (v2hf a, v2hf b) > > +{ > > + return a - b; > > +} > > + > > +v2hf > > +foo6 (v2hf a, v2hf b) > > +{ > > + return a * b; > > +} > > + > > +v2hf > > +foo7 (v2hf a, v2hf b) > > +{ > > + return a / b; > > +} > > + > > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_0" 7 } } */ > > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_movss" 8 } } */ > > -- > > 2.31.1 > >
On Mon, Aug 7, 2023 at 1:20 PM Richard Biener <richard.guenther@gmail.com> wrote:
>
> > Please also note the RFC patch [1] that relaxes clears for V2SFmode
> > with -fno-trapping-math. The patched compiler will then emit the same
> > code as clang does for -O2. Which raises another question - should gcc
> > default to -fno-trapping-math?
>
> I think we discussed this before and yes, IMHO we should default to
> -fno-trapping-math at least for C/C++ to be consistent with our other
> handling of the FP environment (default to -fno-rounding-math) and
> lack of proper FENV access barriers for inspecting the exceptions.
>
> Note Fortran has the -ffpe-trap= option which would then need to make
> sure to also enable -ftrapping-math. Ada might have similar constraints
> (it also uses -fnon-call-exceptions, but unless it enables CPU traps for
> FP exceptions that would be a no-op). Note this also shows we should
> possibly separate maintaining the IEEE exception state and considering
> changes in the IEEE exception states to cause CPU traps (that's also
> a source of common confusion on the user side).

FTR: PR54192, "-fno-trapping-math by default?" [1]

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54192

Uros.
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 896af76a33f..88bdf084f54 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64 ;; V2S* modes (define_mode_iterator V2FI [V2SF V2SI]) -;; 4-byte and 8-byte float16 vector modes -(define_mode_iterator VHF_32_64 [V4HF V2HF]) - +(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF]) ;; Mapping from integer vector mode to mnemonic suffix (define_mode_attr mmxvecsize [(V8QI "b") (V4QI "b") (V2QI "b") @@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower ;; Mapping of vector modes to a vector mode of double size (define_mode_attr mmxdoublevecmode - [(V2SF "V4SF") (V2SI "V4SI")]) + [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")]) ;; Mapping of vector modes back to the scalar modes (define_mode_attr mmxscalarmode @@ -594,7 +592,7 @@ (define_insn "sse_movntq" (define_expand "movq_<mode>_to_sse" [(set (match_operand:<mmxdoublevecmode> 0 "register_operand") (vec_concat:<mmxdoublevecmode> - (match_operand:V2FI 1 "nonimmediate_operand") + (match_operand:V2FI_V4HF 1 "nonimmediate_operand") (match_dup 2)))] "TARGET_SSE2" "operands[2] = CONST0_RTX (<MODE>mode);") @@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2" ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(define_insn "<insn><mode>3" - [(set (match_operand:VHF_32_64 0 "register_operand" "=v") - (plusminusmultdiv:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand" "<comm>v") - (match_operand:VHF_32_64 2 "register_operand" "v")))] +(define_expand "<insn>v4hf3" + [(set (match_operand:V4HF 0 "register_operand") + (plusminusmult:V4HF + (match_operand:V4HF 1 "nonimmediate_operand") + (match_operand:V4HF 2 "nonimmediate_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" - "v<insn>ph\t{%2, %1, %0|%0, %1, %2}" - [(set (attr "type") - (cond [(match_test "<CODE> == MULT") - (const_string "ssemul") - (match_test "<CODE> == DIV") - (const_string "ssediv")] - (const_string 
"sseadd"))) - (set_attr "prefix" "evex") - (set_attr "mode" "V8HF")]) +{ + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_movq_v4hf_to_sse (op2, operands[2])); + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); + + emit_insn (gen_<insn>v8hf3 (op0, op1, op2)); + + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); + DONE; +}) + +(define_expand "divv4hf3" + [(set (match_operand:V4HF 0 "register_operand") + (div:V4HF + (match_operand:V4HF 1 "nonimmediate_operand") + (match_operand:V4HF 2 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); + rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2], + force_reg (V4HFmode, CONST1_RTX (V4HFmode))); + emit_insn (gen_rtx_SET (op2, tmp)); + emit_insn (gen_divv8hf3 (op0, op1, op2)); + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); + DONE; +}) + +(define_expand "movd_v2hf_to_sse" + [(set (match_operand:V8HF 0 "register_operand") + (vec_merge:V8HF + (vec_duplicate:V8HF + (match_operand:V2HF 1 "nonimmediate_operand")) + (match_operand:V8HF 2 "reg_or_0_operand") + (const_int 3)))] + "TARGET_SSE") + +(define_expand "<insn>v2hf3" + [(set (match_operand:V2HF 0 "register_operand") + (plusminusmult:V2HF + (match_operand:V2HF 1 "nonimmediate_operand") + (match_operand:V2HF 2 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], CONST0_RTX (V8HFmode))); + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode))); + emit_insn (gen_<insn>v8hf3 (op0, op1, op2)); + + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode)); + DONE; 
+}) + +(define_expand "divv2hf3" + [(set (match_operand:V2HF 0 "register_operand") + (div:V2HF + (match_operand:V2HF 1 "nonimmediate_operand") + (match_operand:V2HF 2 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + rtx op2 = gen_reg_rtx (V8HFmode); + rtx op1 = gen_reg_rtx (V8HFmode); + rtx op0 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], + force_reg (V8HFmode, CONST1_RTX (V8HFmode)))); + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode))); + emit_insn (gen_divv8hf3 (op0, op1, op2)); + + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode)); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ab455c3e297..7383a50ee0d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -430,6 +430,9 @@ (define_mode_iterator VF_512 (define_mode_iterator VFB_512 [V32HF V16SF V8DF]) +(define_mode_iterator V4SF_V8HF + [V4SF V8HF]) + (define_mode_iterator VI48_AVX512VL [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) @@ -10873,11 +10876,11 @@ (define_insn "*vec_concatv2sf_sse" (set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") (set_attr "mode" "V4SF,SF,DI,DI")]) -(define_insn "*vec_concatv4sf" - [(set (match_operand:V4SF 0 "register_operand" "=x,v,x,v") - (vec_concat:V4SF - (match_operand:V2SF 1 "register_operand" " 0,v,0,v") - (match_operand:V2SF 2 "nonimmediate_operand" " x,v,m,m")))] +(define_insn "*vec_concat<mode>" + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=x,v,x,v") + (vec_concat:V4SF_V8HF + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v") + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))] "TARGET_SSE" "@ movlhps\t{%2, %0|%0, %2} @@ -10889,17 +10892,34 @@ (define_insn "*vec_concatv4sf" (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") (set_attr "mode" 
"V4SF,V4SF,V2SF,V2SF")]) -(define_insn "*vec_concatv4sf_0" - [(set (match_operand:V4SF 0 "register_operand" "=v") - (vec_concat:V4SF - (match_operand:V2SF 1 "nonimmediate_operand" "vm") - (match_operand:V2SF 2 "const0_operand")))] +(define_insn "*vec_concat<mode>_0" + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=v") + (vec_concat:V4SF_V8HF + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") + (match_operand:<ssehalfvecmode> 2 "const0_operand")))] "TARGET_SSE2" "%vmovq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix" "maybe_vex") (set_attr "mode" "DF")]) +(define_insn "*vec_concatv8hf_movss" + [(set (match_operand:V8HF 0 "register_operand" "=x,v,v") + (vec_merge:V8HF + (vec_duplicate:V8HF + (match_operand:V2HF 2 "nonimmediate_operand" "x,m,v")) + (match_operand:V8HF 1 "reg_or_0_operand" "0,C,v" ) + (const_int 3)))] + "TARGET_SSE" + "@ + movss\t{%2, %0|%0, %2} + %vmovss\t{%2, %0|%0, %2} + vmovss\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,*,avx") + (set_attr "type" "ssemov") + (set_attr "prefix" "orig,maybe_vex,maybe_vex") + (set_attr "mode" "SF")]) + ;; Avoid combining registers from different units in a single alternative, ;; see comment above inline_secondary_memory_needed function in i386.cc (define_insn "vec_set<mode>_0" diff --git a/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c new file mode 100644 index 00000000000..332784ac694 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c @@ -0,0 +1,57 @@ +/* PR target/110762 */ +/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -dp" } */ + +typedef _Float16 v4hf __attribute__((vector_size(8))); +typedef _Float16 v2hf __attribute__((vector_size(4))); + +v4hf +foo (v4hf a, v4hf b) +{ + return a + b; +} + +v4hf +foo2 (v4hf a, v4hf b) +{ + return a - b; +} + +v4hf +foo3 (v4hf a, v4hf b) +{ + return a * b; +} + +v4hf +foo1 (v4hf a, v4hf b) +{ + return a / b; +} + +v2hf +foo4 (v2hf a, v2hf b) +{ + return a + b; +} + +v2hf +foo5 (v2hf a, v2hf b) +{ + return a - b; +} + +v2hf +foo6 (v2hf a, v2hf b) +{ + return a * b; +} + +v2hf +foo7 (v2hf a, v2hf b) +{ + return a / b; +} + +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_0" 7 } } */ +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_movss" 8 } } */