Message ID | 20220513071602.91413-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | Optimize vpermtiw/b to vpunpcklqdq for certain cases. | expand |
On Fri, May 13, 2022 at 9:16 AM liuhongt <hongtao.liu@intel.com> wrote: > > Assembly Optimization like: > - vmovq %xmm0, %xmm2 > - vmovdqa .LC0(%rip), %xmm0 > vmovq %xmm1, %xmm1 > - vpermi2w %xmm1, %xmm2, %xmm0 > + vmovq %xmm0, %xmm0 > + vpunpcklqdq %xmm1, %xmm0, %xmm0 > > ... > > -.LC0: > - .value 0 > - .value 1 > - .value 2 > - .value 3 > - .value 8 > - .value 9 > - .value 10 > - .value 11 > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk? > > gcc/ChangeLog: > > PR target/105033 > * config/i386/sse.md (*vec_concatv4si): Extend to .. > (*vec_concat<mode>): .. V16QI and V8HImode. > (*vec_concatv16qi_permt2): New pre_reload define_insn_and_split. > (*vec_concatv8hi_permt2): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr105033.c: New test. > --- > gcc/config/i386/sse.md | 62 ++++++++++++++++++++++-- > gcc/testsuite/gcc.target/i386/pr105033.c | 27 +++++++++++ > 2 files changed, 84 insertions(+), 5 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr105033.c > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index a63df0d0b1f..2e417e47d20 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -19600,11 +19600,11 @@ (define_insn "*vec_concatv2si" > (set_attr "type" "sselog,ssemov,sselog,ssemov,mmxcvt,mmxmov") > (set_attr "mode" "TI,TI,V4SF,SF,DI,DI")]) > > -(define_insn "*vec_concatv4si" > - [(set (match_operand:V4SI 0 "register_operand" "=x,v,x,x,v") > - (vec_concat:V4SI > - (match_operand:V2SI 1 "register_operand" " 0,v,0,0,v") > - (match_operand:V2SI 2 "nonimmediate_operand" " x,v,x,m,m")))] > +(define_insn "*vec_concat<mode>" > + [(set (match_operand:VI124_128 0 "register_operand" "=x,v,x,x,v") > + (vec_concat:VI124_128 > + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,0,v") > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,x,m,m")))] > "TARGET_SSE" > "@ > punpcklqdq\t{%2, %0|%0, %2} > @@ -19617,6 +19617,58 @@ (define_insn "*vec_concatv4si" > 
(set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") > (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) > > +(define_insn_and_split "*vec_concatv16qi_permt2" > + [(set (match_operand:V16QI 0 "register_operand") > + (unspec:V16QI > + [(const_vector:V16QI [(const_int 0) (const_int 1) > + (const_int 2) (const_int 3) > + (const_int 4) (const_int 5) > + (const_int 6) (const_int 7) > + (const_int 16) (const_int 17) > + (const_int 18) (const_int 19) > + (const_int 20) (const_int 21) > + (const_int 22) (const_int 23)]) > + (match_operand:V16QI 1 "register_operand") > + (match_operand:V16QI 2 "nonimmediate_operand")] > + UNSPEC_VPERMT2))] > + "TARGET_AVX512VL && TARGET_AVX512VBMI" You need "&& ix86_pre_reload_split ()" here, because a pseudo can be generated via force_reg. > + "#" > + "&& 1" > + [(set (match_dup 0) > + (vec_concat:V16QI (match_dup 1) (match_dup 2)))] > +{ > + operands[1] = lowpart_subreg (V8QImode, > + force_reg (V16QImode, operands[1]), > + V16QImode); > + if (!MEM_P (operands[2])) > + operands[2] = force_reg (V16QImode, operands[2]); Are you sure there are no subregs possible in operands[2]? To stay on the safe side, use force_reg unconditionally; it will also force subregs to reg, avoiding failure with the following lowpart_subreg. Uros. 
> + operands[2] = lowpart_subreg (V8QImode, operands[2], V16QImode); > +}) > + > +(define_insn_and_split "*vec_concatv8hi_permt2" > + [(set (match_operand:V8HI 0 "register_operand") > + (unspec:V8HI > + [(const_vector:V8HI [(const_int 0) (const_int 1) > + (const_int 2) (const_int 3) > + (const_int 8) (const_int 9) > + (const_int 10) (const_int 11)]) > + (match_operand:V8HI 1 "register_operand") > + (match_operand:V8HI 2 "nonimmediate_operand")] > + UNSPEC_VPERMT2))] > + "TARGET_AVX512VL && TARGET_AVX512BW" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (vec_concat:V8HI (match_dup 1) (match_dup 2)))] > +{ > + operands[1] = lowpart_subreg (V4HImode, > + force_reg (V8HImode, operands[1]), > + V8HImode); > + if (!MEM_P (operands[2])) > + operands[2] = force_reg (V8HImode, operands[2]); > + operands[2] = lowpart_subreg (V4HImode, operands[2], V8HImode); > +}) > + > (define_insn "*vec_concat<mode>_0" > [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > (vec_concat:VI124_128 > diff --git a/gcc/testsuite/gcc.target/i386/pr105033.c b/gcc/testsuite/gcc.target/i386/pr105033.c > new file mode 100644 > index 00000000000..ab05e3b3bc8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr105033.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=sapphirerapids -O2" } */ > +/* { dg-final { scan-assembler-times {vpunpcklqdq[ \t]+} 3 } } */ > +/* { dg-final { scan-assembler-not {vpermi2[wb][ \t]+} } } */ > + > +typedef _Float16 v8hf __attribute__((vector_size (16))); > +typedef _Float16 v4hf __attribute__((vector_size (8))); > +typedef short v8hi __attribute__((vector_size (16))); > +typedef short v4hi __attribute__((vector_size (8))); > +typedef char v16qi __attribute__((vector_size (16))); > +typedef char v8qi __attribute__((vector_size (8))); > + > +v8hf foo (v4hf a, v4hf b) > +{ > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7); > +} > + > +v8hi foo2 (v4hi a, v4hi b) > +{ > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 
5, 6, 7); > +} > + > +v16qi foo3 (v8qi a, v8qi b) > +{ > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7, > + 8, 9, 10, 11, 12, 13, 14, 15); > +} > -- > 2.18.1 >
> -----Original Message----- > From: Uros Bizjak <ubizjak@gmail.com> > Sent: Friday, May 13, 2022 4:15 PM > To: Liu, Hongtao <hongtao.liu@intel.com> > Cc: gcc-patches@gcc.gnu.org > Subject: Re: [PATCH] Optimize vpermtiw/b to vpunpcklqdq for certain cases. > > On Fri, May 13, 2022 at 9:16 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > Assembly Optimization like: > > - vmovq %xmm0, %xmm2 > > - vmovdqa .LC0(%rip), %xmm0 > > vmovq %xmm1, %xmm1 > > - vpermi2w %xmm1, %xmm2, %xmm0 > > + vmovq %xmm0, %xmm0 > > + vpunpcklqdq %xmm1, %xmm0, %xmm0 > > > > ... > > > > -.LC0: > > - .value 0 > > - .value 1 > > - .value 2 > > - .value 3 > > - .value 8 > > - .value 9 > > - .value 10 > > - .value 11 > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ok for trunk? > > > > gcc/ChangeLog: > > > > PR target/105033 > > * config/i386/sse.md (*vec_concatv4si): Extend to .. > > (*vec_concat<mode>): .. V16QI and V8HImode. > > (*vec_concatv16qi_permt2): New pre_reload define_insn_and_split. > > (*vec_concatv8hi_permt2): Ditto. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/pr105033.c: New test. 
> > --- > > gcc/config/i386/sse.md | 62 ++++++++++++++++++++++-- > > gcc/testsuite/gcc.target/i386/pr105033.c | 27 +++++++++++ > > 2 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 > > gcc/testsuite/gcc.target/i386/pr105033.c > > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index > > a63df0d0b1f..2e417e47d20 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -19600,11 +19600,11 @@ (define_insn "*vec_concatv2si" > > (set_attr "type" "sselog,ssemov,sselog,ssemov,mmxcvt,mmxmov") > > (set_attr "mode" "TI,TI,V4SF,SF,DI,DI")]) > > > > -(define_insn "*vec_concatv4si" > > - [(set (match_operand:V4SI 0 "register_operand" "=x,v,x,x,v") > > - (vec_concat:V4SI > > - (match_operand:V2SI 1 "register_operand" " 0,v,0,0,v") > > - (match_operand:V2SI 2 "nonimmediate_operand" " x,v,x,m,m")))] > > +(define_insn "*vec_concat<mode>" > > + [(set (match_operand:VI124_128 0 "register_operand" "=x,v,x,x,v") > > + (vec_concat:VI124_128 > > + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,0,v") > > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " > > +x,v,x,m,m")))] > > "TARGET_SSE" > > "@ > > punpcklqdq\t{%2, %0|%0, %2} > > @@ -19617,6 +19617,58 @@ (define_insn "*vec_concatv4si" > > (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") > > (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) > > > > +(define_insn_and_split "*vec_concatv16qi_permt2" > > + [(set (match_operand:V16QI 0 "register_operand") > > + (unspec:V16QI > > + [(const_vector:V16QI [(const_int 0) (const_int 1) > > + (const_int 2) (const_int 3) > > + (const_int 4) (const_int 5) > > + (const_int 6) (const_int 7) > > + (const_int 16) (const_int 17) > > + (const_int 18) (const_int 19) > > + (const_int 20) (const_int 21) > > + (const_int 22) (const_int 23)]) > > + (match_operand:V16QI 1 "register_operand") > > + (match_operand:V16QI 2 "nonimmediate_operand")] > > + UNSPEC_VPERMT2))] > > + "TARGET_AVX512VL && TARGET_AVX512VBMI" > > You need
"&& ix86_pre_reload_split ()" here, because a pseudo can be > generated via force_reg. > Will change. > > + "#" > > + "&& 1" > > + [(set (match_dup 0) > > + (vec_concat:V16QI (match_dup 1) (match_dup 2)))] { > > + operands[1] = lowpart_subreg (V8QImode, > > + force_reg (V16QImode, operands[1]), > > + V16QImode); > > + if (!MEM_P (operands[2])) > > + operands[2] = force_reg (V16QImode, operands[2]); > > Are you sure there are no subregs possible in operand[2]? To stay on the safe > side, use force_reg unconditionally, it will also force subregs to reg, avoiding > failure with the following lowpart_subreg. When it's a MEM, there is no need to force_reg. > > Uros. > > > + operands[2] = lowpart_subreg (V8QImode, operands[2], V16QImode); > > +}) > > + > > +(define_insn_and_split "*vec_concatv8hi_permt2" > > + [(set (match_operand:V8HI 0 "register_operand") > > + (unspec:V8HI > > + [(const_vector:V8HI [(const_int 0) (const_int 1) > > + (const_int 2) (const_int 3) > > + (const_int 8) (const_int 9) > > + (const_int 10) (const_int 11)]) > > + (match_operand:V8HI 1 "register_operand") > > + (match_operand:V8HI 2 "nonimmediate_operand")] > > + UNSPEC_VPERMT2))] > > + "TARGET_AVX512VL && TARGET_AVX512BW" > > + "#" > > + "&& 1" > > + [(set (match_dup 0) > > + (vec_concat:V8HI (match_dup 1) (match_dup 2)))] { > > + operands[1] = lowpart_subreg (V4HImode, > > + force_reg (V8HImode, operands[1]), > > + V8HImode); > > + if (!MEM_P (operands[2])) > > + operands[2] = force_reg (V8HImode, operands[2]); > > + operands[2] = lowpart_subreg (V4HImode, operands[2], V8HImode); > > +}) > > + > > (define_insn "*vec_concat<mode>_0" > > [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > > (vec_concat:VI124_128 > > diff --git a/gcc/testsuite/gcc.target/i386/pr105033.c > > b/gcc/testsuite/gcc.target/i386/pr105033.c > > new file mode 100644 > > index 00000000000..ab05e3b3bc8 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr105033.c > > @@ -0,0 +1,27 @@ > > +/* { dg-do compile } */ > >
+/* { dg-options "-march=sapphirerapids -O2" } */ > > +/* { dg-final { scan-assembler-times {vpunpcklqdq[ \t]+} 3 } } */ > > +/* { dg-final { scan-assembler-not {vpermi2[wb][ \t]+} } } */ > > + > > +typedef _Float16 v8hf __attribute__((vector_size (16))); typedef > > +_Float16 v4hf __attribute__((vector_size (8))); typedef short v8hi > > +__attribute__((vector_size (16))); typedef short v4hi > > +__attribute__((vector_size (8))); typedef char v16qi > > +__attribute__((vector_size (16))); typedef char v8qi > > +__attribute__((vector_size (8))); > > + > > +v8hf foo (v4hf a, v4hf b) > > +{ > > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7); } > > + > > +v8hi foo2 (v4hi a, v4hi b) > > +{ > > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7); } > > + > > +v16qi foo3 (v8qi a, v8qi b) > > +{ > > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7, > > + 8, 9, 10, 11, 12, 13, 14, 15); } > > -- > > 2.18.1 > >
On Fri, May 13, 2022 at 10:54 AM Liu, Hongtao <hongtao.liu@intel.com> wrote: > > > > > -----Original Message----- > > From: Uros Bizjak <ubizjak@gmail.com> > > Sent: Friday, May 13, 2022 4:15 PM > > To: Liu, Hongtao <hongtao.liu@intel.com> > > Cc: gcc-patches@gcc.gnu.org > > Subject: Re: [PATCH] Optimize vpermtiw/b to vpunpcklqdq for certain cases. > > > > On Fri, May 13, 2022 at 9:16 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > Assembly Optimization like: > > > - vmovq %xmm0, %xmm2 > > > - vmovdqa .LC0(%rip), %xmm0 > > > vmovq %xmm1, %xmm1 > > > - vpermi2w %xmm1, %xmm2, %xmm0 > > > + vmovq %xmm0, %xmm0 > > > + vpunpcklqdq %xmm1, %xmm0, %xmm0 > > > > > > ... > > > > > > -.LC0: > > > - .value 0 > > > - .value 1 > > > - .value 2 > > > - .value 3 > > > - .value 8 > > > - .value 9 > > > - .value 10 > > > - .value 11 > > > > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > > Ok for trunk? > > > > > > gcc/ChangeLog: > > > > > > PR target/105033 > > > * config/i386/sse.md (*vec_concatv4si): Extend to .. > > > (*vec_concat<mode>): .. V16QI and V8HImode. > > > (*vec_concatv16qi_permt2): New pre_reload define_insn_and_split. > > > (*vec_concatv8hi_permt2): Ditto. > > > > > > gcc/testsuite/ChangeLog: > > > > > > * gcc.target/i386/pr105033.c: New test. 
> > > --- > > > gcc/config/i386/sse.md | 62 ++++++++++++++++++++++-- > > > gcc/testsuite/gcc.target/i386/pr105033.c | 27 +++++++++++ > > > 2 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 > > > gcc/testsuite/gcc.target/i386/pr105033.c > > > > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index > > > a63df0d0b1f..2e417e47d20 100644 > > > --- a/gcc/config/i386/sse.md > > > +++ b/gcc/config/i386/sse.md > > > @@ -19600,11 +19600,11 @@ (define_insn "*vec_concatv2si" > > > (set_attr "type" "sselog,ssemov,sselog,ssemov,mmxcvt,mmxmov") > > > (set_attr "mode" "TI,TI,V4SF,SF,DI,DI")]) > > > > > > -(define_insn "*vec_concatv4si" > > > - [(set (match_operand:V4SI 0 "register_operand" "=x,v,x,x,v") > > > - (vec_concat:V4SI > > > - (match_operand:V2SI 1 "register_operand" " 0,v,0,0,v") > > > - (match_operand:V2SI 2 "nonimmediate_operand" " x,v,x,m,m")))] > > > +(define_insn "*vec_concat<mode>" > > > + [(set (match_operand:VI124_128 0 "register_operand" "=x,v,x,x,v") > > > + (vec_concat:VI124_128 > > > + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,0,v") > > > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " > > > +x,v,x,m,m")))] > > > "TARGET_SSE" > > > "@ > > > punpcklqdq\t{%2, %0|%0, %2} > > > @@ -19617,6 +19617,58 @@ (define_insn "*vec_concatv4si" > > > (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") > > > (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) > > > > > > +(define_insn_and_split "*vec_concatv16qi_permt2" > > > + [(set (match_operand:V16QI 0 "register_operand") > > > + (unspec:V16QI > > > + [(const_vector:V16QI [(const_int 0) (const_int 1) > > > + (const_int 2) (const_int 3) > > > + (const_int 4) (const_int 5) > > > + (const_int 6) (const_int 7) > > > + (const_int 16) (const_int 17) > > > + (const_int 18) (const_int 19) > > > + (const_int 20) (const_int 21) > > > + (const_int 22) (const_int 23)]) > > > + (match_operand:V16QI 1 "register_operand") > > > + (match_operand:V16QI 2 
"nonimmediate_operand")] > > > + UNSPEC_VPERMT2))] > > > + "TARGET_AVX512VL && TARGET_AVX512VBMI" > > > > You need "&& ix86_pre_reload_split ()" here, because a pseudo can be > > generated via force_reg. > > > will change. > > > + "#" > > > + "&& 1" > > > + [(set (match_dup 0) > > > + (vec_concat:V16QI (match_dup 1) (match_dup 2)))] { > > > + operands[1] = lowpart_subreg (V8QImode, > > > + force_reg (V16QImode, operands[1]), > > > + V16QImode); > > > + if (!MEM_P (operands[2])) > > > + operands[2] = force_reg (V16QImode, operands[2]); > > > > Are you sure there are no subregs possible in operand[2]? To stay on the safe > > side, use force_reg unconditionally, it will also force subregs to reg, avoiding > > failure with the following lowpart_subreg. > When it's MEM, not need to force_reg. Ah, I misread this. Uros. > > > > Uros. > > > > > + operands[2] = lowpart_subreg (V8QImode, operands[2], V16QImode); > > > +}) > > > + > > > +(define_insn_and_split "*vec_concatv8hi_permt2" > > > + [(set (match_operand:V8HI 0 "register_operand") > > > + (unspec:V8HI > > > + [(const_vector:V8HI [(const_int 0) (const_int 1) > > > + (const_int 2) (const_int 3) > > > + (const_int 8) (const_int 9) > > > + (const_int 10) (const_int 11)]) > > > + (match_operand:V8HI 1 "register_operand") > > > + (match_operand:V8HI 2 "nonimmediate_operand")] > > > + UNSPEC_VPERMT2))] > > > + "TARGET_AVX512VL && TARGET_AVX512BW" > > > + "#" > > > + "&& 1" > > > + [(set (match_dup 0) > > > + (vec_concat:V8HI (match_dup 1) (match_dup 2)))] { > > > + operands[1] = lowpart_subreg (V4HImode, > > > + force_reg (V8HImode, operands[1]), > > > + V8HImode); > > > + if (!MEM_P (operands[2])) > > > + operands[2] = force_reg (V8HImode, operands[2]); > > > + operands[2] = lowpart_subreg (V4HImode, operands[2], V8HImode); > > > +}) > > > + > > > (define_insn "*vec_concat<mode>_0" > > > [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > > > (vec_concat:VI124_128 > > > diff --git 
a/gcc/testsuite/gcc.target/i386/pr105033.c > > > b/gcc/testsuite/gcc.target/i386/pr105033.c > > > new file mode 100644 > > > index 00000000000..ab05e3b3bc8 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr105033.c > > > @@ -0,0 +1,27 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-march=sapphirerapids -O2" } */ > > > +/* { dg-final { scan-assembler-times {vpunpcklqdq[ \t]+} 3 } } */ > > > +/* { dg-final { scan-assembler-not {vpermi2[wb][ \t]+} } } */ > > > + > > > +typedef _Float16 v8hf __attribute__((vector_size (16))); typedef > > > +_Float16 v4hf __attribute__((vector_size (8))); typedef short v8hi > > > +__attribute__((vector_size (16))); typedef short v4hi > > > +__attribute__((vector_size (8))); typedef char v16qi > > > +__attribute__((vector_size (16))); typedef char v8qi > > > +__attribute__((vector_size (8))); > > > + > > > +v8hf foo (v4hf a, v4hf b) > > > +{ > > > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7); } > > > + > > > +v8hi foo2 (v4hi a, v4hi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7); } > > > + > > > +v16qi foo3 (v8qi a, v8qi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7, > > > + 8, 9, 10, 11, 12, 13, 14, 15); } > > > -- > > > 2.18.1 > > >
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a63df0d0b1f..2e417e47d20 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -19600,11 +19600,11 @@ (define_insn "*vec_concatv2si"
    (set_attr "type" "sselog,ssemov,sselog,ssemov,mmxcvt,mmxmov")
    (set_attr "mode" "TI,TI,V4SF,SF,DI,DI")])
 
-(define_insn "*vec_concatv4si"
-  [(set (match_operand:V4SI 0 "register_operand" "=x,v,x,x,v")
-        (vec_concat:V4SI
-          (match_operand:V2SI 1 "register_operand" " 0,v,0,0,v")
-          (match_operand:V2SI 2 "nonimmediate_operand" " x,v,x,m,m")))]
+(define_insn "*vec_concat<mode>"
+  [(set (match_operand:VI124_128 0 "register_operand" "=x,v,x,x,v")
+        (vec_concat:VI124_128
+          (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,0,v")
+          (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,x,m,m")))]
   "TARGET_SSE"
   "@
    punpcklqdq\t{%2, %0|%0, %2}
@@ -19617,6 +19617,62 @@ (define_insn "*vec_concatv4si"
    (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
    (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
 
+;; Match an UNSPEC_VPERMT2 byte permute whose constant selector takes
+;; elements 0-7 of operand 1 and elements 0-7 of operand 2, and split it
+;; into a vec_concat of the two low halves.  The split code calls
+;; force_reg, so the pattern must only match before reload.
+(define_insn_and_split "*vec_concatv16qi_permt2"
+  [(set (match_operand:V16QI 0 "register_operand")
+        (unspec:V16QI
+          [(const_vector:V16QI [(const_int 0) (const_int 1)
+                                (const_int 2) (const_int 3)
+                                (const_int 4) (const_int 5)
+                                (const_int 6) (const_int 7)
+                                (const_int 16) (const_int 17)
+                                (const_int 18) (const_int 19)
+                                (const_int 20) (const_int 21)
+                                (const_int 22) (const_int 23)])
+           (match_operand:V16QI 1 "register_operand")
+           (match_operand:V16QI 2 "nonimmediate_operand")]
+          UNSPEC_VPERMT2))]
+  "TARGET_AVX512VL && TARGET_AVX512VBMI && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+        (vec_concat:V16QI (match_dup 1) (match_dup 2)))]
+{
+  operands[1] = lowpart_subreg (V8QImode,
+                                force_reg (V16QImode, operands[1]),
+                                V16QImode);
+  if (!MEM_P (operands[2]))
+    operands[2] = force_reg (V16QImode, operands[2]);
+  operands[2] = lowpart_subreg (V8QImode, operands[2], V16QImode);
+})
+
+(define_insn_and_split "*vec_concatv8hi_permt2"
+  [(set (match_operand:V8HI 0 "register_operand")
+        (unspec:V8HI
+          [(const_vector:V8HI [(const_int 0) (const_int 1)
+                               (const_int 2) (const_int 3)
+                               (const_int 8) (const_int 9)
+                               (const_int 10) (const_int 11)])
+           (match_operand:V8HI 1 "register_operand")
+           (match_operand:V8HI 2 "nonimmediate_operand")]
+          UNSPEC_VPERMT2))]
+  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+        (vec_concat:V8HI (match_dup 1) (match_dup 2)))]
+{
+  operands[1] = lowpart_subreg (V4HImode,
+                                force_reg (V8HImode, operands[1]),
+                                V8HImode);
+  if (!MEM_P (operands[2]))
+    operands[2] = force_reg (V8HImode, operands[2]);
+  operands[2] = lowpart_subreg (V4HImode, operands[2], V8HImode);
+})
+
 (define_insn "*vec_concat<mode>_0"
   [(set (match_operand:VI124_128 0 "register_operand" "=v,x")
         (vec_concat:VI124_128
diff --git a/gcc/testsuite/gcc.target/i386/pr105033.c b/gcc/testsuite/gcc.target/i386/pr105033.c
new file mode 100644
index 00000000000..ab05e3b3bc8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105033.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-march=sapphirerapids -O2" } */
+/* { dg-final { scan-assembler-times {vpunpcklqdq[ \t]+} 3 } } */
+/* { dg-final { scan-assembler-not {vpermi2[wb][ \t]+} } } */
+
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef _Float16 v4hf __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v4hi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v8qi __attribute__((vector_size (8)));
+
+v8hf foo (v4hf a, v4hf b)
+{
+  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+v8hi foo2 (v4hi a, v4hi b)
+{
+  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+v16qi foo3 (v8qi a, v8qi b)
+{
+  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7,
+                                  8, 9, 10, 11, 12, 13, 14, 15);
+}