diff mbox series

[PATCHv2,rs6000] Optimize vector construction with two vector doubleword loads [PR103568]

Message ID 8b8d0cdc-7725-4cb6-a31f-257392101b7a@linux.ibm.com
State New
Headers show
Series [PATCHv2,rs6000] Optimize vector construction with two vector doubleword loads [PR103568] | expand

Commit Message

HAO CHEN GUI May 31, 2024, 3:25 a.m. UTC
Hi,
  This patch optimizes the construction of a vector from two doublewords
loaded from memory.  It generates a better insn sequence, because "xxlor"
has lower latency than "mtvsrdd" on Power10.

  Compared with the previous version, the main change is to use the "isa"
attribute to guard "lxsd" and "lxsdx".
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653103.html

  Bootstrapped and tested on powerpc64-linux BE and LE with no
regressions. OK for the trunk?

Thanks
Gui Haochen

ChangeLog
rs6000: Optimize vector construction with two vector doubleword loads

When constructing a vector from two doublewords in memory, GCC currently
generates
	ld 10,0(3)
	ld 9,0(4)
	mtvsrdd 34,9,10

An optimal sequence on Power10 should be
	lxsd 0,0(4)
	lxvrdx 1,0,3
	xxlor 34,1,32

This patch implements the optimization with insn combine and split patterns.

gcc/
	PR target/103568
	* config/rs6000/vsx.md (vsx_ld_lowpart_zero_<mode>): New insn
	pattern.
	(vsx_ld_highpart_zero_<mode>): New insn pattern.
	(vsx_concat_mem_<mode>): New insn_and_split pattern.

gcc/testsuite/
	PR target/103568
	* gcc.target/powerpc/pr103568.c: New test.

patch.diff

Comments

HAO CHEN GUI June 20, 2024, 7:01 a.m. UTC | #1
Hi,
 Gentle ping.
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653180.html

Thanks
Gui Haochen

在 2024/5/31 11:25, HAO CHEN GUI 写道:
> Hi,
>   This patch optimizes vector construction with two vector doubleword loads.
> It generates an optimal insn sequence as "xxlor" has lower latency than
> "mtvsrdd" on Power10.
> 
>   Compared with previous version, the main change is to use "isa" attribute
> to guard "lxsd" and "lxsdx".
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653103.html
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no
> regressions. OK for the trunk?
> 
> Thanks
> Gui Haochen
> 
> ChangeLog
> rs6000: Optimize vector construction with two vector doubleword loads
> 
> When constructing a vector by two doublewords from memory, originally it
> does
> 	ld 10,0(3)
> 	ld 9,0(4)
> 	mtvsrdd 34,9,10
> 
> An optimal sequence on Power10 should be
> 	lxsd 0,0(4)
> 	lxvrdx 1,0,3
> 	xxlor 34,1,32
> 
> This patch does this optimization by insn combine and split.
> 
> gcc/
> 	PR target/103568
> 	* config/rs6000/vsx.md (vsx_ld_lowpart_zero_<mode>): New insn
> 	pattern.
> 	(vsx_ld_highpart_zero_<mode>): New insn pattern.
> 	(vsx_concat_mem_<mode>): New insn_and_split pattern.
> 
> gcc/testsuite/
> 	PR target/103568
> 	* gcc.target/powerpc/pr103568.c: New test.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index f135fa079bd..f9a2a260e89 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -1395,6 +1395,27 @@ (define_insn "vsx_ld_elemrev_v2di"
>    "lxvd2x %x0,%y1"
>    [(set_attr "type" "vecload")])
> 
> +(define_insn "vsx_ld_lowpart_zero_<mode>"
> +  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
> +	(vec_concat:VSX_D
> +	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
> +	  (match_operand:<VEC_base> 2 "zero_constant" "j,j")))]
> +  ""
> +  "@
> +   lxsd %0,%1
> +   lxsdx %x0,%y1"
> +  [(set_attr "type" "vecload,vecload")
> +   (set_attr "isa" "p9v,p7v")])
> +
> +(define_insn "vsx_ld_highpart_zero_<mode>"
> +  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
> +	(vec_concat:VSX_D
> +	  (match_operand:<VEC_base> 1 "zero_constant" "j")
> +	  (match_operand:<VEC_base> 2 "memory_operand" "Z")))]
> +  "TARGET_POWER10"
> +  "lxvrdx %x0,%y2"
> +  [(set_attr "type" "vecload")])
> +
>  (define_insn "vsx_ld_elemrev_v1ti"
>    [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa")
>          (vec_select:V1TI
> @@ -3063,6 +3084,26 @@ (define_insn "vsx_concat_<mode>"
>  }
>    [(set_attr "type" "vecperm,vecmove")])
> 
> +(define_insn_and_split "vsx_concat_mem_<mode>"
> +  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
> +	(vec_concat:VSX_D
> +	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
> +	  (match_operand:<VEC_base> 2 "memory_operand" "Z,Z")))]
> +  "TARGET_POWER10 && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx tmp1 = gen_reg_rtx (<MODE>mode);
> +  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> +  emit_insn (gen_vsx_ld_highpart_zero_<mode> (tmp1, CONST0_RTX (<VEC_base>mode),
> +					      operands[1]));
> +  emit_insn (gen_vsx_ld_lowpart_zero_<mode> (tmp2, operands[2],
> +					     CONST0_RTX (<VEC_base>mode)));
> +  emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2));
> +  DONE;
> +})
> +
>  ;; Combiner patterns to allow creating XXPERMDI's to access either double
>  ;; word element in a vector register.
>  (define_insn "*vsx_concat_<mode>_1"
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr103568.c b/gcc/testsuite/gcc.target/powerpc/pr103568.c
> new file mode 100644
> index 00000000000..b2a06fb2162
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr103568.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +vector double test (double *a, double *b)
> +{
> +  return (vector double) {*a, *b};
> +}
> +
> +vector long long test1 (long long *a, long long *b)
> +{
> +  return (vector long long) {*a, *b};
> +}
> +
> +/* { dg-final { scan-assembler-times {\mlxsd} 2 } } */
> +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
> +
HAO CHEN GUI July 1, 2024, 1:41 a.m. UTC | #2
Hi,
 Gentle ping.
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653180.html

Thanks
Gui Haochen


在 2024/6/20 15:01, HAO CHEN GUI 写道:
> Hi,
>  Gently ping it.
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653180.html
> 
> Thanks
> Gui Haochen
> 
> 在 2024/5/31 11:25, HAO CHEN GUI 写道:
>> Hi,
>>   This patch optimizes vector construction with two vector doubleword loads.
>> It generates an optimal insn sequence as "xxlor" has lower latency than
>> "mtvsrdd" on Power10.
>>
>>   Compared with previous version, the main change is to use "isa" attribute
>> to guard "lxsd" and "lxsdx".
>> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653103.html
>>
>>   Bootstrapped and tested on powerpc64-linux BE and LE with no
>> regressions. OK for the trunk?
>>
>> Thanks
>> Gui Haochen
>>
>> ChangeLog
>> rs6000: Optimize vector construction with two vector doubleword loads
>>
>> When constructing a vector by two doublewords from memory, originally it
>> does
>> 	ld 10,0(3)
>> 	ld 9,0(4)
>> 	mtvsrdd 34,9,10
>>
>> An optimal sequence on Power10 should be
>> 	lxsd 0,0(4)
>> 	lxvrdx 1,0,3
>> 	xxlor 34,1,32
>>
>> This patch does this optimization by insn combine and split.
>>
>> gcc/
>> 	PR target/103568
>> 	* config/rs6000/vsx.md (vsx_ld_lowpart_zero_<mode>): New insn
>> 	pattern.
>> 	(vsx_ld_highpart_zero_<mode>): New insn pattern.
>> 	(vsx_concat_mem_<mode>): New insn_and_split pattern.
>>
>> gcc/testsuite/
>> 	PR target/103568
>> 	* gcc.target/powerpc/pr103568.c: New test.
>>
>> patch.diff
>> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
>> index f135fa079bd..f9a2a260e89 100644
>> --- a/gcc/config/rs6000/vsx.md
>> +++ b/gcc/config/rs6000/vsx.md
>> @@ -1395,6 +1395,27 @@ (define_insn "vsx_ld_elemrev_v2di"
>>    "lxvd2x %x0,%y1"
>>    [(set_attr "type" "vecload")])
>>
>> +(define_insn "vsx_ld_lowpart_zero_<mode>"
>> +  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
>> +	(vec_concat:VSX_D
>> +	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
>> +	  (match_operand:<VEC_base> 2 "zero_constant" "j,j")))]
>> +  ""
>> +  "@
>> +   lxsd %0,%1
>> +   lxsdx %x0,%y1"
>> +  [(set_attr "type" "vecload,vecload")
>> +   (set_attr "isa" "p9v,p7v")])
>> +
>> +(define_insn "vsx_ld_highpart_zero_<mode>"
>> +  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
>> +	(vec_concat:VSX_D
>> +	  (match_operand:<VEC_base> 1 "zero_constant" "j")
>> +	  (match_operand:<VEC_base> 2 "memory_operand" "Z")))]
>> +  "TARGET_POWER10"
>> +  "lxvrdx %x0,%y2"
>> +  [(set_attr "type" "vecload")])
>> +
>>  (define_insn "vsx_ld_elemrev_v1ti"
>>    [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa")
>>          (vec_select:V1TI
>> @@ -3063,6 +3084,26 @@ (define_insn "vsx_concat_<mode>"
>>  }
>>    [(set_attr "type" "vecperm,vecmove")])
>>
>> +(define_insn_and_split "vsx_concat_mem_<mode>"
>> +  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
>> +	(vec_concat:VSX_D
>> +	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
>> +	  (match_operand:<VEC_base> 2 "memory_operand" "Z,Z")))]
>> +  "TARGET_POWER10 && can_create_pseudo_p ()"
>> +  "#"
>> +  "&& 1"
>> +  [(const_int 0)]
>> +{
>> +  rtx tmp1 = gen_reg_rtx (<MODE>mode);
>> +  rtx tmp2 = gen_reg_rtx (<MODE>mode);
>> +  emit_insn (gen_vsx_ld_highpart_zero_<mode> (tmp1, CONST0_RTX (<VEC_base>mode),
>> +					      operands[1]));
>> +  emit_insn (gen_vsx_ld_lowpart_zero_<mode> (tmp2, operands[2],
>> +					     CONST0_RTX (<VEC_base>mode)));
>> +  emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2));
>> +  DONE;
>> +})
>> +
>>  ;; Combiner patterns to allow creating XXPERMDI's to access either double
>>  ;; word element in a vector register.
>>  (define_insn "*vsx_concat_<mode>_1"
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr103568.c b/gcc/testsuite/gcc.target/powerpc/pr103568.c
>> new file mode 100644
>> index 00000000000..b2a06fb2162
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr103568.c
>> @@ -0,0 +1,17 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
>> +
>> +vector double test (double *a, double *b)
>> +{
>> +  return (vector double) {*a, *b};
>> +}
>> +
>> +vector long long test1 (long long *a, long long *b)
>> +{
>> +  return (vector long long) {*a, *b};
>> +}
>> +
>> +/* { dg-final { scan-assembler-times {\mlxsd} 2 } } */
>> +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 2 } } */
>> +/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
>> +
Kewen.Lin July 18, 2024, 9:13 a.m. UTC | #3
Hi Haochen,

on 2024/5/31 11:25, HAO CHEN GUI wrote:
> Hi,
>   This patch optimizes vector construction with two vector doubleword loads.
> It generates an optimal insn sequence as "xxlor" has lower latency than
> "mtvsrdd" on Power10.
> 
>   Compared with previous version, the main change is to use "isa" attribute
> to guard "lxsd" and "lxsdx".
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653103.html
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no
> regressions. OK for the trunk?
> 
> Thanks
> Gui Haochen
> 
> ChangeLog
> rs6000: Optimize vector construction with two vector doubleword loads
> 
> When constructing a vector by two doublewords from memory, originally it
> does
> 	ld 10,0(3)
> 	ld 9,0(4)
> 	mtvsrdd 34,9,10
> 
> An optimal sequence on Power10 should be
> 	lxsd 0,0(4)
> 	lxvrdx 1,0,3
> 	xxlor 34,1,32

Thanks for doing this.  As mentioned in PR comment #0 (#c0), could you also
evaluate whether it actually helps the SPEC2017 benchmark 510.parest_r on Power10?

> 
> This patch does this optimization by insn combine and split.
> 
> gcc/
> 	PR target/103568
> 	* config/rs6000/vsx.md (vsx_ld_lowpart_zero_<mode>): New insn
> 	pattern.
> 	(vsx_ld_highpart_zero_<mode>): New insn pattern.
> 	(vsx_concat_mem_<mode>): New insn_and_split pattern.
> 
> gcc/testsuite/
> 	PR target/103568
> 	* gcc.target/powerpc/pr103568.c: New test.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index f135fa079bd..f9a2a260e89 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -1395,6 +1395,27 @@ (define_insn "vsx_ld_elemrev_v2di"
>    "lxvd2x %x0,%y1"
>    [(set_attr "type" "vecload")])
> 
> +(define_insn "vsx_ld_lowpart_zero_<mode>"

Nit: maybe just use the instruction mnemonic in the pattern name?

> +  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
> +	(vec_concat:VSX_D
> +	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
> +	  (match_operand:<VEC_base> 2 "zero_constant" "j,j")))]

I think we should consider both BE and LE here.  This pattern only
matches the underlying insn on BE; we need a new pattern for LE
that swaps operand 1 and operand 2.

> +  ""
> +  "@
> +   lxsd %0,%1
> +   lxsdx %x0,%y1"
> +  [(set_attr "type" "vecload,vecload")
> +   (set_attr "isa" "p9v,p7v")])

Guarding this semantic with a pre-Power10 ISA is wrong here: these two
insns are not guaranteed to zero doubleword 1 on pre-Power10
processors such as Power9.

ISA 3.1

  The contents of doubleword element 1 of VSR[VRT+32]
  are set to 0.

ISA 3.0...2.06

  The contents of doubleword element 1 of VSR[XT] are
  undefined.

> +
> +(define_insn "vsx_ld_highpart_zero_<mode>"
> +  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
> +	(vec_concat:VSX_D
> +	  (match_operand:<VEC_base> 1 "zero_constant" "j")
> +	  (match_operand:<VEC_base> 2 "memory_operand" "Z")))]

Likewise for this pattern's semantics (BE vs. LE).

> +  "TARGET_POWER10"
> +  "lxvrdx %x0,%y2"
> +  [(set_attr "type" "vecload")])
> +
>  (define_insn "vsx_ld_elemrev_v1ti"
>    [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa")
>          (vec_select:V1TI
> @@ -3063,6 +3084,26 @@ (define_insn "vsx_concat_<mode>"
>  }
>    [(set_attr "type" "vecperm,vecmove")])
> 
> +(define_insn_and_split "vsx_concat_mem_<mode>"
> +  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
> +	(vec_concat:VSX_D
> +	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
> +	  (match_operand:<VEC_base> 2 "memory_operand" "Z,Z")))]
> +  "TARGET_POWER10 && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  rtx tmp1 = gen_reg_rtx (<MODE>mode);
> +  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> +  emit_insn (gen_vsx_ld_highpart_zero_<mode> (tmp1, CONST0_RTX (<VEC_base>mode),
> +					      operands[1]));
> +  emit_insn (gen_vsx_ld_lowpart_zero_<mode> (tmp2, operands[2],
> +					     CONST0_RTX (<VEC_base>mode)));
> +  emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2));
> +  DONE;
> +})
> +
>  ;; Combiner patterns to allow creating XXPERMDI's to access either double
>  ;; word element in a vector register.
>  (define_insn "*vsx_concat_<mode>_1"
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr103568.c b/gcc/testsuite/gcc.target/powerpc/pr103568.c
> new file mode 100644
> index 00000000000..b2a06fb2162
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr103568.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +vector double test (double *a, double *b)
> +{
> +  return (vector double) {*a, *b};
> +}
> +
> +vector long long test1 (long long *a, long long *b)
> +{
> +  return (vector long long) {*a, *b};
> +}
> +
> +/* { dg-final { scan-assembler-times {\mlxsd} 2 } } */
> +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
> +

BR,
Kewen
diff mbox series

Patch

diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index f135fa079bd..f9a2a260e89 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -1395,6 +1395,27 @@  (define_insn "vsx_ld_elemrev_v2di"
   "lxvd2x %x0,%y1"
   [(set_attr "type" "vecload")])

+(define_insn "vsx_ld_lowpart_zero_<mode>"  ; load element 0, zero element 1
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
+	(vec_concat:VSX_D  ; NOTE(review): matches lxsd/lxsdx on BE only; confirm LE
+	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
+	  (match_operand:<VEC_base> 2 "zero_constant" "j,j")))]
+  "TARGET_POWER10"  ; ISA 3.1 zeroes doubleword 1; it is undefined before
+  "@
+   lxsd %0,%1
+   lxsdx %x0,%y1"
+  [(set_attr "type" "vecload,vecload")
+   (set_attr "isa" "p9v,p7v")])
+
+(define_insn "vsx_ld_highpart_zero_<mode>"  ; zero element 0, load element 1
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+	(vec_concat:VSX_D  ; NOTE(review): element order as on BE; confirm LE
+	  (match_operand:<VEC_base> 1 "zero_constant" "j")
+	  (match_operand:<VEC_base> 2 "memory_operand" "Z")))]
+  "TARGET_POWER10"  ; pattern is only enabled for Power10
+  "lxvrdx %x0,%y2"
+  [(set_attr "type" "vecload")])
+
 (define_insn "vsx_ld_elemrev_v1ti"
   [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa")
         (vec_select:V1TI
@@ -3063,6 +3084,26 @@  (define_insn "vsx_concat_<mode>"
 }
   [(set_attr "type" "vecperm,vecmove")])

+(define_insn_and_split "vsx_concat_mem_<mode>"  ; vector from two memory doublewords
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
+	(vec_concat:VSX_D
+	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
+	  (match_operand:<VEC_base> 2 "memory_operand" "Z,Z")))]
+  "TARGET_POWER10 && can_create_pseudo_p ()"  ; the split allocates two pseudos
+  "#"  ; no direct output template; the insn is always split
+  "&& 1"
+  [(const_int 0)]
+{  /* NOTE(review): tmp1 | tmp2 is {operands[2], operands[1]} in vec_concat order; confirm on BE.  */
+  rtx tmp1 = gen_reg_rtx (<MODE>mode);  /* receives {0, operands[1]} */
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);  /* receives {operands[2], 0} */
+  emit_insn (gen_vsx_ld_highpart_zero_<mode> (tmp1, CONST0_RTX (<VEC_base>mode),
+					      operands[1]));
+  emit_insn (gen_vsx_ld_lowpart_zero_<mode> (tmp2, operands[2],
+					     CONST0_RTX (<VEC_base>mode)));
+  emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2));  /* merge the two halves */
+  DONE;
+})
+
 ;; Combiner patterns to allow creating XXPERMDI's to access either double
 ;; word element in a vector register.
 (define_insn "*vsx_concat_<mode>_1"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr103568.c b/gcc/testsuite/gcc.target/powerpc/pr103568.c
new file mode 100644
index 00000000000..b2a06fb2162
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr103568.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+vector double test (double *a, double *b)  /* vector built from two double loads */
+{
+  return (vector double) {*a, *b};
+}
+
+vector long long test1 (long long *a, long long *b)  /* vector built from two long long loads */
+{
+  return (vector long long) {*a, *b};
+}
+
+/* { dg-final { scan-assembler-times {\mlxsd} 2 } } */
+/* { dg-final { scan-assembler-times {\mlxvrdx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+