diff mbox series

[i386] Optimize movzwl + vmovd/vmovq to vmovw.

Message ID 20220509020313.8835-1-hongtao.liu@intel.com
State New
Headers show
Series [i386] Optimize movzwl + vmovd/vmovq to vmovw. | expand

Commit Message

liuhongt May 9, 2022, 2:03 a.m. UTC
Similarly optimize movl + vmovq to vmovd.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/104915
	* config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New
	pre_reload define_insn_and_split.
	(*vec_setv2di_0_zero_extendhi_1): Ditto.
	(*vec_set<mode>_0_zero_extendsi): Ditto.
	(*vec_setv2di_0_zero_extendsi_1): Ditto.
	(ssewvecmode): New mode attr.
	(ssewvecmodelower): Ditto.
	(ssepackmodelower): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr104915-vmovd.c: New test.
	* gcc.target/i386/pr104915-vmovw.c: New test.
---
 gcc/config/i386/sse.md                        | 94 +++++++++++++++++++
 .../gcc.target/i386/pr104915-vmovd.c          | 25 +++++
 .../gcc.target/i386/pr104915-vmovw.c          | 45 +++++++++
 3 files changed, 164 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c

Comments

Uros Bizjak May 9, 2022, 8:28 a.m. UTC | #1
On Mon, May 9, 2022 at 4:03 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Similarly optimize movl + vmovq to vmovd.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/104915
>         * config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New
>         pre_reload define_insn_and_split.
>         (*vec_setv2di_0_zero_extendhi_1): Ditto.
>         (*vec_set<mode>_0_zero_extendsi): Ditto.
>         (*vec_setv2di_0_zero_extendsi_1): Ditto.
>         (ssewvecmode): New mode attr.
>         (ssewvecmodelower): Ditto.
>         (ssepackmodelower): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr104915-vmovd.c: New test.
>         * gcc.target/i386/pr104915-vmovw.c: New test.

I wonder if these define_insn_and_splits can instead be implemented
via combine splitter (which has the unfortunate limitation that the
output sequence has to be exactly two instructions, which is true in
your case). Combine splitter is preferred, since it splits immediately
and the resulting insns can be combined further during the combine
pass.

Uros.

> ---
>  gcc/config/i386/sse.md                        | 94 +++++++++++++++++++
>  .../gcc.target/i386/pr104915-vmovd.c          | 25 +++++
>  .../gcc.target/i386/pr104915-vmovw.c          | 45 +++++++++
>  3 files changed, 164 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 7b791def542..2ad8a2b46b8 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -985,6 +985,15 @@ (define_mode_attr sseintvecmode
>     (V32HI "V32HI") (V64QI "V64QI")
>     (V32QI "V32QI") (V16QI "V16QI")])
>
> +;; Mapping of vector modes to an V*HImode of the same size
> +(define_mode_attr ssewvecmode
> +  [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI")
> +   (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")])
> +
> +(define_mode_attr ssewvecmodelower
> +  [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi")
> +   (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")])
> +
>  (define_mode_attr sseintvecmode2
>    [(V8DF "XI") (V4DF "OI") (V2DF "TI")
>     (V8SF "OI") (V4SF "TI")
> @@ -1194,6 +1203,11 @@ (define_mode_attr ssepackmode
>     (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
>     (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
>
> +(define_mode_attr ssepackmodelower
> +  [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si")
> +   (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si")
> +   (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")])
> +
>  ;; Mapping of the max integer size for xop rotate immediate constraint
>  (define_mode_attr sserotatemax
>    [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
> @@ -10681,6 +10695,46 @@ (define_insn "vec_set<mode>_0"
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "HF")])
>
> +(define_insn_and_split "*vec_set<mode>_0_zero_extendhi"
> +  [(set (match_operand:VI48_AVX512F 0 "register_operand")
> +       (vec_merge:VI48_AVX512F
> +        (vec_duplicate:VI48_AVX512F
> +         (zero_extend:<ssescalarmode>
> +           (match_operand:HI 1 "nonimmediate_operand")))
> +        (match_operand:VI48_AVX512F 2 "const0_operand")
> +        (const_int 1)))]
> +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx dest = gen_reg_rtx (<ssewvecmode>mode);
> +  emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest,
> +                                             CONST0_RTX (<ssewvecmode>mode),
> +                                             operands[1]));
> +  emit_move_insn (operands[0],
> +                 lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode));
> +  DONE;
> +})
> +
> +(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1"
> +  [(set (match_operand:V2DI 0 "register_operand")
> +       (vec_concat:V2DI
> +         (zero_extend:DI
> +           (match_operand:HI 1 "nonimmediate_operand"))
> +         (const_int 0)))]
> +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx dest = gen_reg_rtx (V8HImode);
> +  emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1]));
> +  emit_move_insn (operands[0],
> +                 lowpart_subreg (V2DImode, dest, V8HImode));
> +  DONE;
> +})
> +
>  (define_insn "avx512fp16_movsh"
>    [(set (match_operand:V8HF 0 "register_operand" "=v")
>         (vec_merge:V8HF
> @@ -10750,6 +10804,46 @@ (define_insn "vec_set<mode>_0"
>            ]
>            (symbol_ref "true")))])
>
> +(define_insn_and_split "*vec_set<mode>_0_zero_extendsi"
> +  [(set (match_operand:VI8 0 "register_operand")
> +       (vec_merge:VI8
> +        (vec_duplicate:VI8
> +         (zero_extend:DI
> +           (match_operand:SI 1 "nonimmediate_operand")))
> +        (match_operand:VI8 2 "const0_operand")
> +        (const_int 1)))]
> +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx dest = gen_reg_rtx (<ssepackmode>mode);
> +  emit_insn (gen_vec_set<ssepackmodelower>_0 (dest,
> +                                             CONST0_RTX (<ssepackmode>mode),
> +                                             operands[1]));
> +  emit_move_insn (operands[0],
> +                 lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode));
> +  DONE;
> +})
> +
> +(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
> +  [(set (match_operand:V2DI 0 "register_operand")
> +       (vec_concat:V2DI
> +         (zero_extend:DI
> +           (match_operand:SI 1 "nonimmediate_operand"))
> +         (const_int 0)))]
> +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx dest = gen_reg_rtx (V4SImode);
> +  emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1]));
> +  emit_move_insn (operands[0],
> +                 lowpart_subreg (V2DImode, dest, V4SImode));
> +  DONE;
> +})
> +
>  (define_insn "sse4_1_insertps"
>    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
>         (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> new file mode 100644
> index 00000000000..913ff8806f1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */
> +/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */
> +
> +#include<immintrin.h>
> +
> +__m128i
> +foo1 (int* p)
> +{
> +  return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0]));
> +}
> +
> +__m256i
> +foo3 (int* p)
> +{
> +  return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0]));
> +}
> +
> +__m512i
> +foo5 (int* p)
> +{
> +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> +                          (unsigned int) ((*(__m32_u *)p)[0]));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> new file mode 100644
> index 00000000000..ac47865d17a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx512fp16 -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */
> +/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */
> +
> +#include<immintrin.h>
> +__m128i
> +foo (short* p)
> +{
> +  return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m128i
> +foo1 (short* p)
> +{
> +  return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m256i
> +foo2 (short* p)
> +{
> +  return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0,
> +                          (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m256i
> +foo3 (short* p)
> +{
> +  return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m512i
> +foo4 (short* p)
> +{
> +  return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
> +                          0, 0, 0, 0, 0, 0, 0,
> +                          (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> +
> +__m512i
> +foo5 (short* p)
> +{
> +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> +                          (unsigned short) ((*(__m16_u *)p)[0]));
> +}
> --
> 2.18.1
>
Hongtao Liu May 11, 2022, 3:39 a.m. UTC | #2
On Mon, May 9, 2022 at 4:28 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, May 9, 2022 at 4:03 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > Similarly optimize movl + vmovq to vmovd.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         PR target/104915
> >         * config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New
> >         pre_reload define_insn_and_split.
> >         (*vec_setv2di_0_zero_extendhi_1): Ditto.
> >         (*vec_set<mode>_0_zero_extendsi): Ditto.
> >         (*vec_setv2di_0_zero_extendsi_1): Ditto.
> >         (ssewvecmode): New mode attr.
> >         (ssewvecmodelower): Ditto.
> >         (ssepackmodelower): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr104915-vmovd.c: New test.
> >         * gcc.target/i386/pr104915-vmovw.c: New test.
>
> I wonder if these define_insn_and_splits can instead be implemented
> via combine splitter (which has the unfortunate limitation that the
> output sequence has to be exactly two instructions, which is true in
> your case). Combine splitter is preferred, since it splits immediately
> and the resulting insns can be combined further during the combine
> pass.

try_combine only goes into combine_split_insns when at least 3 insns
are being combined; here we have just 2 insns, so it fails.

-----cut from combine.cc--------
3545  /* If we were combining three insns and the result is a simple SET
 3546     with no ASM_OPERANDS that wasn't recognized, try to split it into two
 3547     insns.  There are two ways to do this.  It can be split using a
 3548     machine-specific method (like when you have an addition of a large
 3549     constant) or by combine in the function find_split_point.  */
 3550
 3551=>if (i1 && insn_code_number < 0 && GET_CODE (newpat) == SET
 3552      && asm_noperands (newpat) < 0)
-------cut end-------------

>
> Uros.
>
> > ---
> >  gcc/config/i386/sse.md                        | 94 +++++++++++++++++++
> >  .../gcc.target/i386/pr104915-vmovd.c          | 25 +++++
> >  .../gcc.target/i386/pr104915-vmovw.c          | 45 +++++++++
> >  3 files changed, 164 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 7b791def542..2ad8a2b46b8 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -985,6 +985,15 @@ (define_mode_attr sseintvecmode
> >     (V32HI "V32HI") (V64QI "V64QI")
> >     (V32QI "V32QI") (V16QI "V16QI")])
> >
> > +;; Mapping of vector modes to an V*HImode of the same size
> > +(define_mode_attr ssewvecmode
> > +  [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI")
> > +   (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")])
> > +
> > +(define_mode_attr ssewvecmodelower
> > +  [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi")
> > +   (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")])
> > +
> >  (define_mode_attr sseintvecmode2
> >    [(V8DF "XI") (V4DF "OI") (V2DF "TI")
> >     (V8SF "OI") (V4SF "TI")
> > @@ -1194,6 +1203,11 @@ (define_mode_attr ssepackmode
> >     (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
> >     (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
> >
> > +(define_mode_attr ssepackmodelower
> > +  [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si")
> > +   (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si")
> > +   (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")])
> > +
> >  ;; Mapping of the max integer size for xop rotate immediate constraint
> >  (define_mode_attr sserotatemax
> >    [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
> > @@ -10681,6 +10695,46 @@ (define_insn "vec_set<mode>_0"
> >     (set_attr "prefix" "evex")
> >     (set_attr "mode" "HF")])
> >
> > +(define_insn_and_split "*vec_set<mode>_0_zero_extendhi"
> > +  [(set (match_operand:VI48_AVX512F 0 "register_operand")
> > +       (vec_merge:VI48_AVX512F
> > +        (vec_duplicate:VI48_AVX512F
> > +         (zero_extend:<ssescalarmode>
> > +           (match_operand:HI 1 "nonimmediate_operand")))
> > +        (match_operand:VI48_AVX512F 2 "const0_operand")
> > +        (const_int 1)))]
> > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(const_int 0)]
> > +{
> > +  rtx dest = gen_reg_rtx (<ssewvecmode>mode);
> > +  emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest,
> > +                                             CONST0_RTX (<ssewvecmode>mode),
> > +                                             operands[1]));
> > +  emit_move_insn (operands[0],
> > +                 lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode));
> > +  DONE;
> > +})
> > +
> > +(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1"
> > +  [(set (match_operand:V2DI 0 "register_operand")
> > +       (vec_concat:V2DI
> > +         (zero_extend:DI
> > +           (match_operand:HI 1 "nonimmediate_operand"))
> > +         (const_int 0)))]
> > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(const_int 0)]
> > +{
> > +  rtx dest = gen_reg_rtx (V8HImode);
> > +  emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1]));
> > +  emit_move_insn (operands[0],
> > +                 lowpart_subreg (V2DImode, dest, V8HImode));
> > +  DONE;
> > +})
> > +
> >  (define_insn "avx512fp16_movsh"
> >    [(set (match_operand:V8HF 0 "register_operand" "=v")
> >         (vec_merge:V8HF
> > @@ -10750,6 +10804,46 @@ (define_insn "vec_set<mode>_0"
> >            ]
> >            (symbol_ref "true")))])
> >
> > +(define_insn_and_split "*vec_set<mode>_0_zero_extendsi"
> > +  [(set (match_operand:VI8 0 "register_operand")
> > +       (vec_merge:VI8
> > +        (vec_duplicate:VI8
> > +         (zero_extend:DI
> > +           (match_operand:SI 1 "nonimmediate_operand")))
> > +        (match_operand:VI8 2 "const0_operand")
> > +        (const_int 1)))]
> > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(const_int 0)]
> > +{
> > +  rtx dest = gen_reg_rtx (<ssepackmode>mode);
> > +  emit_insn (gen_vec_set<ssepackmodelower>_0 (dest,
> > +                                             CONST0_RTX (<ssepackmode>mode),
> > +                                             operands[1]));
> > +  emit_move_insn (operands[0],
> > +                 lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode));
> > +  DONE;
> > +})
> > +
> > +(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
> > +  [(set (match_operand:V2DI 0 "register_operand")
> > +       (vec_concat:V2DI
> > +         (zero_extend:DI
> > +           (match_operand:SI 1 "nonimmediate_operand"))
> > +         (const_int 0)))]
> > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(const_int 0)]
> > +{
> > +  rtx dest = gen_reg_rtx (V4SImode);
> > +  emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1]));
> > +  emit_move_insn (operands[0],
> > +                 lowpart_subreg (V2DImode, dest, V4SImode));
> > +  DONE;
> > +})
> > +
> >  (define_insn "sse4_1_insertps"
> >    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
> >         (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > new file mode 100644
> > index 00000000000..913ff8806f1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-mavx512f -O2" } */
> > +/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */
> > +/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */
> > +
> > +#include<immintrin.h>
> > +
> > +__m128i
> > +foo1 (int* p)
> > +{
> > +  return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0]));
> > +}
> > +
> > +__m256i
> > +foo3 (int* p)
> > +{
> > +  return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0]));
> > +}
> > +
> > +__m512i
> > +foo5 (int* p)
> > +{
> > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > +                          (unsigned int) ((*(__m32_u *)p)[0]));
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > new file mode 100644
> > index 00000000000..ac47865d17a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > @@ -0,0 +1,45 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-mavx512fp16 -O2" } */
> > +/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */
> > +/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */
> > +
> > +#include<immintrin.h>
> > +__m128i
> > +foo (short* p)
> > +{
> > +  return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m128i
> > +foo1 (short* p)
> > +{
> > +  return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m256i
> > +foo2 (short* p)
> > +{
> > +  return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0,
> > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m256i
> > +foo3 (short* p)
> > +{
> > +  return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m512i
> > +foo4 (short* p)
> > +{
> > +  return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
> > +                          0, 0, 0, 0, 0, 0, 0,
> > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > +
> > +__m512i
> > +foo5 (short* p)
> > +{
> > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > +}
> > --
> > 2.18.1
> >
Uros Bizjak May 11, 2022, 6:37 a.m. UTC | #3
On Wed, May 11, 2022 at 5:39 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, May 9, 2022 at 4:28 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Mon, May 9, 2022 at 4:03 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > Similarly optimize movl + vmovq to vmovd.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/104915
> > >         * config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New
> > >         pre_reload define_insn_and_split.
> > >         (*vec_setv2di_0_zero_extendhi_1): Ditto.
> > >         (*vec_set<mode>_0_zero_extendsi): Ditto.
> > >         (*vec_setv2di_0_zero_extendsi_1): Ditto.
> > >         (ssewvecmode): New mode attr.
> > >         (ssewvecmodelower): Ditto.
> > >         (ssepackmodelower): Ditto.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/pr104915-vmovd.c: New test.
> > >         * gcc.target/i386/pr104915-vmovw.c: New test.

OK.

Thanks,
Uros.

> >
> > I wonder if these define_insn_and_splits can instead be implemented
> > via combine splitter (which has the unfortunate limitation that the
> > output sequence has to be exactly two instructions, which is true in
> > your case). Combine splitter is preferred, since it splits immediately
> > and the resulting insns can be combined further during the combine
> > pass.
>
> try_combine only goes into combine_split_insns when at least 3 insns
> are being combined; here we have just 2 insns, so it fails.
>
> -----cut from combine.cc--------
> 3545  /* If we were combining three insns and the result is a simple SET
>  3546     with no ASM_OPERANDS that wasn't recognized, try to split it into two
>  3547     insns.  There are two ways to do this.  It can be split using a
>  3548     machine-specific method (like when you have an addition of a large
>  3549     constant) or by combine in the function find_split_point.  */
>  3550
>  3551=>if (i1 && insn_code_number < 0 && GET_CODE (newpat) == SET
>  3552      && asm_noperands (newpat) < 0)
> -------cut end-------------
>
> >
> > Uros.
> >
> > > ---
> > >  gcc/config/i386/sse.md                        | 94 +++++++++++++++++++
> > >  .../gcc.target/i386/pr104915-vmovd.c          | 25 +++++
> > >  .../gcc.target/i386/pr104915-vmovw.c          | 45 +++++++++
> > >  3 files changed, 164 insertions(+)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > >
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index 7b791def542..2ad8a2b46b8 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -985,6 +985,15 @@ (define_mode_attr sseintvecmode
> > >     (V32HI "V32HI") (V64QI "V64QI")
> > >     (V32QI "V32QI") (V16QI "V16QI")])
> > >
> > > +;; Mapping of vector modes to an V*HImode of the same size
> > > +(define_mode_attr ssewvecmode
> > > +  [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI")
> > > +   (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")])
> > > +
> > > +(define_mode_attr ssewvecmodelower
> > > +  [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi")
> > > +   (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")])
> > > +
> > >  (define_mode_attr sseintvecmode2
> > >    [(V8DF "XI") (V4DF "OI") (V2DF "TI")
> > >     (V8SF "OI") (V4SF "TI")
> > > @@ -1194,6 +1203,11 @@ (define_mode_attr ssepackmode
> > >     (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
> > >     (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
> > >
> > > +(define_mode_attr ssepackmodelower
> > > +  [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si")
> > > +   (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si")
> > > +   (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")])
> > > +
> > >  ;; Mapping of the max integer size for xop rotate immediate constraint
> > >  (define_mode_attr sserotatemax
> > >    [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
> > > @@ -10681,6 +10695,46 @@ (define_insn "vec_set<mode>_0"
> > >     (set_attr "prefix" "evex")
> > >     (set_attr "mode" "HF")])
> > >
> > > +(define_insn_and_split "*vec_set<mode>_0_zero_extendhi"
> > > +  [(set (match_operand:VI48_AVX512F 0 "register_operand")
> > > +       (vec_merge:VI48_AVX512F
> > > +        (vec_duplicate:VI48_AVX512F
> > > +         (zero_extend:<ssescalarmode>
> > > +           (match_operand:HI 1 "nonimmediate_operand")))
> > > +        (match_operand:VI48_AVX512F 2 "const0_operand")
> > > +        (const_int 1)))]
> > > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (<ssewvecmode>mode);
> > > +  emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest,
> > > +                                             CONST0_RTX (<ssewvecmode>mode),
> > > +                                             operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode));
> > > +  DONE;
> > > +})
> > > +
> > > +(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1"
> > > +  [(set (match_operand:V2DI 0 "register_operand")
> > > +       (vec_concat:V2DI
> > > +         (zero_extend:DI
> > > +           (match_operand:HI 1 "nonimmediate_operand"))
> > > +         (const_int 0)))]
> > > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (V8HImode);
> > > +  emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (V2DImode, dest, V8HImode));
> > > +  DONE;
> > > +})
> > > +
> > >  (define_insn "avx512fp16_movsh"
> > >    [(set (match_operand:V8HF 0 "register_operand" "=v")
> > >         (vec_merge:V8HF
> > > @@ -10750,6 +10804,46 @@ (define_insn "vec_set<mode>_0"
> > >            ]
> > >            (symbol_ref "true")))])
> > >
> > > +(define_insn_and_split "*vec_set<mode>_0_zero_extendsi"
> > > +  [(set (match_operand:VI8 0 "register_operand")
> > > +       (vec_merge:VI8
> > > +        (vec_duplicate:VI8
> > > +         (zero_extend:DI
> > > +           (match_operand:SI 1 "nonimmediate_operand")))
> > > +        (match_operand:VI8 2 "const0_operand")
> > > +        (const_int 1)))]
> > > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (<ssepackmode>mode);
> > > +  emit_insn (gen_vec_set<ssepackmodelower>_0 (dest,
> > > +                                             CONST0_RTX (<ssepackmode>mode),
> > > +                                             operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode));
> > > +  DONE;
> > > +})
> > > +
> > > +(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
> > > +  [(set (match_operand:V2DI 0 "register_operand")
> > > +       (vec_concat:V2DI
> > > +         (zero_extend:DI
> > > +           (match_operand:SI 1 "nonimmediate_operand"))
> > > +         (const_int 0)))]
> > > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (V4SImode);
> > > +  emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (V2DImode, dest, V4SImode));
> > > +  DONE;
> > > +})
> > > +
> > >  (define_insn "sse4_1_insertps"
> > >    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
> > >         (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > > new file mode 100644
> > > index 00000000000..913ff8806f1
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-mavx512f -O2" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */
> > > +/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */
> > > +
> > > +#include<immintrin.h>
> > > +
> > > +__m128i
> > > +foo1 (int* p)
> > > +{
> > > +  return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0]));
> > > +}
> > > +
> > > +__m256i
> > > +foo3 (int* p)
> > > +{
> > > +  return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0]));
> > > +}
> > > +
> > > +__m512i
> > > +foo5 (int* p)
> > > +{
> > > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned int) ((*(__m32_u *)p)[0]));
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > > new file mode 100644
> > > index 00000000000..ac47865d17a
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > > @@ -0,0 +1,45 @@
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-mavx512fp16 -O2" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */
> > > +/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */
> > > +
> > > +#include<immintrin.h>
> > > +__m128i
> > > +foo (short* p)
> > > +{
> > > +  return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m128i
> > > +foo1 (short* p)
> > > +{
> > > +  return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m256i
> > > +foo2 (short* p)
> > > +{
> > > +  return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m256i
> > > +foo3 (short* p)
> > > +{
> > > +  return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m512i
> > > +foo4 (short* p)
> > > +{
> > > +  return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
> > > +                          0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m512i
> > > +foo5 (short* p)
> > > +{
> > > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > --
> > > 2.18.1
> > >
>
>
>
> --
> BR,
> Hongtao
diff mbox series

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7b791def542..2ad8a2b46b8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -985,6 +985,15 @@  (define_mode_attr sseintvecmode
    (V32HI "V32HI") (V64QI "V64QI")
    (V32QI "V32QI") (V16QI "V16QI")])
 
+;; Mapping of vector modes to a V*HImode of the same size
+(define_mode_attr ssewvecmode
+  [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI")
+   (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")])
+
+(define_mode_attr ssewvecmodelower
+  [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi")
+   (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")])
+
 (define_mode_attr sseintvecmode2
   [(V8DF "XI") (V4DF "OI") (V2DF "TI")
    (V8SF "OI") (V4SF "TI")
@@ -1194,6 +1203,11 @@  (define_mode_attr ssepackmode
    (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
    (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
 
+(define_mode_attr ssepackmodelower
+  [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si")
+   (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si")
+   (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")])
+
 ;; Mapping of the max integer size for xop rotate immediate constraint
 (define_mode_attr sserotatemax
   [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
@@ -10681,6 +10695,46 @@  (define_insn "vec_set<mode>_0"
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
 
+(define_insn_and_split "*vec_set<mode>_0_zero_extendhi"
+  [(set (match_operand:VI48_AVX512F 0 "register_operand")
+	(vec_merge:VI48_AVX512F
+	 (vec_duplicate:VI48_AVX512F
+	  (zero_extend:<ssescalarmode>
+	    (match_operand:HI 1 "nonimmediate_operand")))
+	 (match_operand:VI48_AVX512F 2 "const0_operand")
+	 (const_int 1)))]
+  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx dest = gen_reg_rtx (<ssewvecmode>mode);
+  emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest,
+					      CONST0_RTX (<ssewvecmode>mode),
+					      operands[1]));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode));
+  DONE;
+})
+
+(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1"
+  [(set (match_operand:V2DI 0 "register_operand")
+	(vec_concat:V2DI
+	  (zero_extend:DI
+	    (match_operand:HI 1 "nonimmediate_operand"))
+	  (const_int 0)))]
+  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx dest = gen_reg_rtx (V8HImode);
+  emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1]));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (V2DImode, dest, V8HImode));
+  DONE;
+})
+
 (define_insn "avx512fp16_movsh"
   [(set (match_operand:V8HF 0 "register_operand" "=v")
 	(vec_merge:V8HF
@@ -10750,6 +10804,46 @@  (define_insn "vec_set<mode>_0"
 	   ]
 	   (symbol_ref "true")))])
 
+(define_insn_and_split "*vec_set<mode>_0_zero_extendsi"  ;; Element 0 = SImode operand 1 zero-extended to DImode; every other element comes from the zero vector.
+  [(set (match_operand:VI8 0 "register_operand")
+	(vec_merge:VI8
+	 (vec_duplicate:VI8
+	  (zero_extend:DI
+	    (match_operand:SI 1 "nonimmediate_operand")))
+	 (match_operand:VI8 2 "const0_operand")  ;; All-zero vector supplying the remaining elements.
+	 (const_int 1)))]  ;; Merge mask 1: only element 0 is taken from the duplicated value.
+  "TARGET_SSE2 && ix86_pre_reload_split ()"  ;; Restricted to before reload (ChangeLog: pre_reload split); the split creates a new pseudo.
+  "#"  ;; No output template; this insn must always be split.
+  "&& 1"
+  [(const_int 0)]  ;; Dummy replacement pattern; the C block emits the real insns and finishes with DONE.
+{
+  rtx dest = gen_reg_rtx (<ssepackmode>mode);
+  emit_insn (gen_vec_set<ssepackmodelower>_0 (dest,
+					      CONST0_RTX (<ssepackmode>mode),
+					      operands[1]));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode));
+  DONE;
+})  ;; Split: vec_set_0 of operand 1 over a zero <ssepackmode> vector (same size, SImode elements, e.g. V2DI -> V4SI), then moved to operand 0 as a <MODE> lowpart subreg.
+
+(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"  ;; V2DI whose low element is zero-extended SImode operand 1 and whose high element is 0.
+  [(set (match_operand:V2DI 0 "register_operand")
+	(vec_concat:V2DI
+	  (zero_extend:DI
+	    (match_operand:SI 1 "nonimmediate_operand"))
+	  (const_int 0)))]  ;; Second (high) DImode half is constant zero.
+  "TARGET_SSE2 && ix86_pre_reload_split ()"  ;; Restricted to before reload (ChangeLog: pre_reload split); the split creates a new pseudo.
+  "#"  ;; No output template; this insn must always be split.
+  "&& 1"
+  [(const_int 0)]  ;; Dummy replacement pattern; the C block emits the real insns and finishes with DONE.
+{
+  rtx dest = gen_reg_rtx (V4SImode);
+  emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1]));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (V2DImode, dest, V4SImode));
+  DONE;
+})  ;; Split: gen_vec_setv4si_0 with a zero V4SI vector and operand 1 into a fresh pseudo, then moved to operand 0 as a V2DI lowpart subreg.
+
 (define_insn "sse4_1_insertps"
   [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
 	(unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
new file mode 100644
index 00000000000..913ff8806f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */
+/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */
+
+#include<immintrin.h>
+
+__m128i
+foo1 (int* p)  /* 128-bit case: low 64-bit element = zero-extended value read via p, high element = 0.  */
+{
+  return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0]));  /* The unsigned int cast makes the widening to 64 bits a zero extension.  */
+}
+
+__m256i
+foo3 (int* p)  /* 256-bit case: lowest 64-bit element = zero-extended value read via p, the other three are 0.  */
+{
+  return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0]));  /* Last argument is the lowest element.  */
+}
+
+__m512i
+foo5 (int* p)  /* 512-bit case: lowest 64-bit element = zero-extended value read via p, the other seven are 0.  */
+{
+  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
+			   (unsigned int) ((*(__m32_u *)p)[0]));  /* Last argument is the lowest element.  */
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
new file mode 100644
index 00000000000..ac47865d17a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
@@ -0,0 +1,45 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx512fp16 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */
+/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */
+
+#include<immintrin.h>
+__m128i
+foo (short* p)  /* 128-bit, 32-bit elements: lowest element = zero-extended 16-bit value read via p, the rest are 0.  */
+{
+  return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));  /* The unsigned short cast makes the widening a zero extension.  */
+}
+
+__m128i
+foo1 (short* p)  /* 128-bit, 64-bit elements: low element = zero-extended 16-bit value read via p, high element = 0.  */
+{
+  return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0]));  /* The unsigned short cast makes the widening a zero extension.  */
+}
+
+__m256i
+foo2 (short* p)  /* 256-bit, 32-bit elements: lowest element = zero-extended 16-bit value read via p, the rest are 0.  */
+{
+  return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0,
+			   (unsigned short) ((*(__m16_u *)p)[0]));  /* Last argument is the lowest element.  */
+}
+
+__m256i
+foo3 (short* p)  /* 256-bit, 64-bit elements: lowest element = zero-extended 16-bit value read via p, the rest are 0.  */
+{
+  return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));  /* Last argument is the lowest element.  */
+}
+
+__m512i
+foo4 (short* p)  /* 512-bit, 32-bit elements: lowest element = zero-extended 16-bit value read via p, the rest are 0.  */
+{
+  return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
+			   0, 0, 0, 0, 0, 0, 0,
+			   (unsigned short) ((*(__m16_u *)p)[0]));  /* Last argument is the lowest element.  */
+}
+
+__m512i
+foo5 (short* p)  /* 512-bit, 64-bit elements: lowest element = zero-extended 16-bit value read via p, the rest are 0.  */
+{
+  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
+			   (unsigned short) ((*(__m16_u *)p)[0]));  /* Last argument is the lowest element.  */
+}