diff mbox series

[V2,i386] Optimize v4si broadcast for noavx512vl.

Message ID 20220307051116.47819-1-hongtao.liu@intel.com
State New
Headers show
Series [V2,i386] Optimize v4si broadcast for noavx512vl. | expand

Commit Message

Liu, Hongtao March 7, 2022, 5:11 a.m. UTC
>What happens if you set preferred_for_speed to false for alternative 1?
It works, and I've removed the newly added splitter in this patch.
I also tried to do the same for *vec_dup<mode> with the mode iterator AVX2_VEC_DUP_MODE, but it hit an ICE during reload because x86 doesn't have a direct move for QImode from a GPR to an SSE register, so in this patch I only handle *vec_dupv4si.

>> +(define_split
>> +  [(set (match_operand:V4SI 0 "sse_reg_operand")
>> +       (vec_duplicate:V4SI
>> +         (match_operand:SI 1 "general_reg_operand")))]
>> +  "TARGET_SSE && reload_completed
>> +   /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
>> +      available, because then we can broadcast from GPRs directly.  */

>I think avx512vl_vec_dup_gprv4si should be merged with the above
>pattern instead.
I have removed this splitter.

This patch enables the following change in the generated code:

-       vbroadcastss    .LC1(%rip), %xmm0
+       movl    $-45, %edx
+       vmovd   %edx, %xmm0
+       vpshufd $0, %xmm0, %xmm0

According to a microbenchmark, the new sequence is faster than broadcasting
from memory when TARGET_INTER_UNIT_MOVES_TO_VEC is set.

gcc/ChangeLog:

	* config/i386/sse.md (*vec_dupv4si): Disable memory operand
	for TARGET_INTER_UNIT_MOVES_TO_VEC via preferred_for_speed.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr100865-8a.c: Adjust testcase.
	* gcc.target/i386/pr100865-8c.c: Ditto.
	* gcc.target/i386/pr100865-9c.c: Ditto.
---
 gcc/config/i386/sse.md                      | 7 ++++++-
 gcc/testsuite/gcc.target/i386/pr100865-8a.c | 2 +-
 gcc/testsuite/gcc.target/i386/pr100865-8c.c | 2 +-
 gcc/testsuite/gcc.target/i386/pr100865-9c.c | 2 +-
 4 files changed, 9 insertions(+), 4 deletions(-)

Comments

Hongtao Liu March 7, 2022, 5:15 a.m. UTC | #1
I ran into a problem with git send-email --cc=a,b,c, so I am CCing manually.

On Mon, Mar 7, 2022 at 1:11 PM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> >What happens if you set preferred_for_speed to false for alternative 1?
> It works, and I've removed the newly added splitter in this patch.
> Also i tried to do similar things to *vec_dup<mode> with mode iterator AVX2_VEC_DUP_MODE, but it hit ICE during reload since x86 don't have direct move for QImode from gpr to sse register. so in this patch i only handle *vec_dupv4si.
>
> >> +(define_split
> >> +  [(set (match_operand:V4SI 0 "sse_reg_operand")
> >> +       (vec_duplicate:V4SI
> >> +         (match_operand:SI 1 "general_reg_operand")))]
> >> +  "TARGET_SSE && reload_completed
> >> +   /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
> >> +      available, because then we can broadcast from GPRs directly.  */
>
> >I think avx512vl_vec_dup_gprv4si should be merged with the above
> >pattern instead.
> Remove this splitter.
>
> This will enable below
>
> -       vbroadcastss    .LC1(%rip), %xmm0
> +       movl    $-45, %edx
> +       vmovd   %edx, %xmm0
> +       vpshufd $0, %xmm0, %xmm0
>
> According to microbenchmark, it's faster than broadcast from memory
> for TARGET_INTER_UNIT_MOVES_TO_VEC.
>
> gcc/ChangeLog:
>
>         * config/i386/sse.md (*vec_dupv4si): Disable memory operand
>         for !TARGET_INTER_UNIT_MOVES_TO_VEC when prefer_for_speed.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr100865-8a.c: Adjust testcase.
>         * gcc.target/i386/pr100865-8c.c: Ditto.
>         * gcc.target/i386/pr100865-9c.c: Ditto.
> ---
>  gcc/config/i386/sse.md                      | 7 ++++++-
>  gcc/testsuite/gcc.target/i386/pr100865-8a.c | 2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-8c.c | 2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-9c.c | 2 +-
>  4 files changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 3066ea3734a..a091853065e 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -25134,7 +25134,12 @@ (define_insn "*vec_dupv4si"
>     (set_attr "length_immediate" "1,0,1")
>     (set_attr "prefix_extra" "0,1,*")
>     (set_attr "prefix" "maybe_vex,maybe_evex,orig")
> -   (set_attr "mode" "TI,V4SF,V4SF")])
> +   (set_attr "mode" "TI,V4SF,V4SF")
> +   (set (attr "preferred_for_speed")
> +     (cond [(eq_attr "alternative" "1")
> +             (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC")
> +          ]
> +          (symbol_ref "true")))])
>
>  (define_insn "*vec_dupv2di"
>    [(set (match_operand:V2DI 0 "register_operand"     "=x,v,v,x")
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> index 911b14d4a25..544a14db6f7 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> @@ -20,5 +20,5 @@ foo (void)
>      array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
>  }
>
> -/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> index 00682edb8c9..efee0488614 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> @@ -3,5 +3,5 @@
>
>  #include "pr100865-8a.c"
>
> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> index 8ffcdc1629d..e6f25902c1d 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> @@ -3,5 +3,5 @@
>
>  #include "pr100865-9a.c"
>
> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> --
> 2.18.1
>
Uros Bizjak March 7, 2022, 11:22 a.m. UTC | #2
On Mon, Mar 7, 2022 at 6:11 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> >What happens if you set preferred_for_speed to false for alternative 1?
> It works, and I've removed the newly added splitter in this patch.
> Also i tried to do similar things to *vec_dup<mode> with mode iterator AVX2_VEC_DUP_MODE, but it hit ICE during reload since x86 don't have direct move for QImode from gpr to sse register. so in this patch i only handle *vec_dupv4si.
>
> >> +(define_split
> >> +  [(set (match_operand:V4SI 0 "sse_reg_operand")
> >> +       (vec_duplicate:V4SI
> >> +         (match_operand:SI 1 "general_reg_operand")))]
> >> +  "TARGET_SSE && reload_completed
> >> +   /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
> >> +      available, because then we can broadcast from GPRs directly.  */
>
> >I think avx512vl_vec_dup_gprv4si should be merged with the above
> >pattern instead.
> Remove this splitter.
>
> This will enable below
>
> -       vbroadcastss    .LC1(%rip), %xmm0
> +       movl    $-45, %edx
> +       vmovd   %edx, %xmm0
> +       vpshufd $0, %xmm0, %xmm0
>
> According to microbenchmark, it's faster than broadcast from memory
> for TARGET_INTER_UNIT_MOVES_TO_VEC.
>
> gcc/ChangeLog:
>
>         * config/i386/sse.md (*vec_dupv4si): Disable memory operand
>         for !TARGET_INTER_UNIT_MOVES_TO_VEC when prefer_for_speed.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr100865-8a.c: Adjust testcase.
>         * gcc.target/i386/pr100865-8c.c: Ditto.
>         * gcc.target/i386/pr100865-9c.c: Ditto.

LGTM.

Thanks,
Uros.

> ---
>  gcc/config/i386/sse.md                      | 7 ++++++-
>  gcc/testsuite/gcc.target/i386/pr100865-8a.c | 2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-8c.c | 2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-9c.c | 2 +-
>  4 files changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 3066ea3734a..a091853065e 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -25134,7 +25134,12 @@ (define_insn "*vec_dupv4si"
>     (set_attr "length_immediate" "1,0,1")
>     (set_attr "prefix_extra" "0,1,*")
>     (set_attr "prefix" "maybe_vex,maybe_evex,orig")
> -   (set_attr "mode" "TI,V4SF,V4SF")])
> +   (set_attr "mode" "TI,V4SF,V4SF")
> +   (set (attr "preferred_for_speed")
> +     (cond [(eq_attr "alternative" "1")
> +             (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC")
> +          ]
> +          (symbol_ref "true")))])
>
>  (define_insn "*vec_dupv2di"
>    [(set (match_operand:V2DI 0 "register_operand"     "=x,v,v,x")
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> index 911b14d4a25..544a14db6f7 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
> @@ -20,5 +20,5 @@ foo (void)
>      array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
>  }
>
> -/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> index 00682edb8c9..efee0488614 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
> @@ -3,5 +3,5 @@
>
>  #include "pr100865-8a.c"
>
> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> index 8ffcdc1629d..e6f25902c1d 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
> @@ -3,5 +3,5 @@
>
>  #include "pr100865-9a.c"
>
> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> --
> 2.18.1
>
diff mbox series

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3066ea3734a..a091853065e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -25134,7 +25134,12 @@  (define_insn "*vec_dupv4si"
    (set_attr "length_immediate" "1,0,1")
    (set_attr "prefix_extra" "0,1,*")
    (set_attr "prefix" "maybe_vex,maybe_evex,orig")
-   (set_attr "mode" "TI,V4SF,V4SF")])
+   (set_attr "mode" "TI,V4SF,V4SF")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "1")
+	      (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC")
+	   ]
+	   (symbol_ref "true")))])
 
 (define_insn "*vec_dupv2di"
   [(set (match_operand:V2DI 0 "register_operand"     "=x,v,v,x")
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
index 911b14d4a25..544a14db6f7 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
@@ -20,5 +20,5 @@  foo (void)
     array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
 }
 
-/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
index 00682edb8c9..efee0488614 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
@@ -3,5 +3,5 @@ 
 
 #include "pr100865-8a.c"
 
-/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
index 8ffcdc1629d..e6f25902c1d 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
@@ -3,5 +3,5 @@ 
 
 #include "pr100865-9a.c"
 
-/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */