Message ID | 8b8d0cdc-7725-4cb6-a31f-257392101b7a@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [PATCHv2,rs6000] Optimize vector construction with two vector doubleword loads [PR103568] | expand |
Hi, Gently ping it. https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653180.html Thanks Gui Haochen 在 2024/5/31 11:25, HAO CHEN GUI 写道: > Hi, > This patch optimizes vector construction with two vector doubleword loads. > It generates an optimal insn sequence as "xxlor" has lower latency than > "mtvsrdd" on Power10. > > Compared with previous version, the main change is to use "isa" attribute > to guard "lxsd" and "lxsdx". > https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653103.html > > Bootstrapped and tested on powerpc64-linux BE and LE with no > regressions. OK for the trunk? > > Thanks > Gui Haochen > > ChangeLog > rs6000: Optimize vector construction with two vector doubleword loads > > When constructing a vector by two doublewords from memory, originally it > does > ld 10,0(3) > ld 9,0(4) > mtvsrdd 34,9,10 > > An optimal sequence on Power10 should be > lxsd 0,0(4) > lxvrdx 1,0,3 > xxlor 34,1,32 > > This patch does this optimization by insn combine and split. > > gcc/ > PR target/103568 > * config/rs6000/vsx.md (vsx_ld_lowpart_zero_<mode>): New insn > pattern. > (vsx_ld_highpart_zero_<mode>): New insn pattern. > (vsx_concat_mem_<mode>): New insn_and_split pattern. > > gcc/testsuite/ > PR target/103568 > * gcc.target/powerpc/pr103568.c: New test. 
> > patch.diff > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index f135fa079bd..f9a2a260e89 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -1395,6 +1395,27 @@ (define_insn "vsx_ld_elemrev_v2di" > "lxvd2x %x0,%y1" > [(set_attr "type" "vecload")]) > > +(define_insn "vsx_ld_lowpart_zero_<mode>" > + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa") > + (vec_concat:VSX_D > + (match_operand:<VEC_base> 1 "memory_operand" "wY,Z") > + (match_operand:<VEC_base> 2 "zero_constant" "j,j")))] > + "" > + "@ > + lxsd %0,%1 > + lxsdx %x0,%y1" > + [(set_attr "type" "vecload,vecload") > + (set_attr "isa" "p9v,p7v")]) > + > +(define_insn "vsx_ld_highpart_zero_<mode>" > + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") > + (vec_concat:VSX_D > + (match_operand:<VEC_base> 1 "zero_constant" "j") > + (match_operand:<VEC_base> 2 "memory_operand" "Z")))] > + "TARGET_POWER10" > + "lxvrdx %x0,%y2" > + [(set_attr "type" "vecload")]) > + > (define_insn "vsx_ld_elemrev_v1ti" > [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa") > (vec_select:V1TI > @@ -3063,6 +3084,26 @@ (define_insn "vsx_concat_<mode>" > } > [(set_attr "type" "vecperm,vecmove")]) > > +(define_insn_and_split "vsx_concat_mem_<mode>" > + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa") > + (vec_concat:VSX_D > + (match_operand:<VEC_base> 1 "memory_operand" "wY,Z") > + (match_operand:<VEC_base> 2 "memory_operand" "Z,Z")))] > + "TARGET_POWER10 && can_create_pseudo_p ()" > + "#" > + "&& 1" > + [(const_int 0)] > +{ > + rtx tmp1 = gen_reg_rtx (<MODE>mode); > + rtx tmp2 = gen_reg_rtx (<MODE>mode); > + emit_insn (gen_vsx_ld_highpart_zero_<mode> (tmp1, CONST0_RTX (<VEC_base>mode), > + operands[1])); > + emit_insn (gen_vsx_ld_lowpart_zero_<mode> (tmp2, operands[2], > + CONST0_RTX (<VEC_base>mode))); > + emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2)); > + DONE; > +}) > + > ;; Combiner patterns to allow creating XXPERMDI's to access either 
double > ;; word element in a vector register. > (define_insn "*vsx_concat_<mode>_1" > diff --git a/gcc/testsuite/gcc.target/powerpc/pr103568.c b/gcc/testsuite/gcc.target/powerpc/pr103568.c > new file mode 100644 > index 00000000000..b2a06fb2162 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr103568.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ > + > +vector double test (double *a, double *b) > +{ > + return (vector double) {*a, *b}; > +} > + > +vector long long test1 (long long *a, long long *b) > +{ > + return (vector long long) {*a, *b}; > +} > + > +/* { dg-final { scan-assembler-times {\mlxsd} 2 } } */ > +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 2 } } */ > +/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */ > +
Hi, Gently ping it. https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653180.html Thanks Gui Haochen 在 2024/6/20 15:01, HAO CHEN GUI 写道: > Hi, > Gently ping it. > https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653180.html > > Thanks > Gui Haochen > > 在 2024/5/31 11:25, HAO CHEN GUI 写道: >> Hi, >> This patch optimizes vector construction with two vector doubleword loads. >> It generates an optimal insn sequence as "xxlor" has lower latency than >> "mtvsrdd" on Power10. >> >> Compared with previous version, the main change is to use "isa" attribute >> to guard "lxsd" and "lxsdx". >> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653103.html >> >> Bootstrapped and tested on powerpc64-linux BE and LE with no >> regressions. OK for the trunk? >> >> Thanks >> Gui Haochen >> >> ChangeLog >> rs6000: Optimize vector construction with two vector doubleword loads >> >> When constructing a vector by two doublewords from memory, originally it >> does >> ld 10,0(3) >> ld 9,0(4) >> mtvsrdd 34,9,10 >> >> An optimal sequence on Power10 should be >> lxsd 0,0(4) >> lxvrdx 1,0,3 >> xxlor 34,1,32 >> >> This patch does this optimization by insn combine and split. >> >> gcc/ >> PR target/103568 >> * config/rs6000/vsx.md (vsx_ld_lowpart_zero_<mode>): New insn >> pattern. >> (vsx_ld_highpart_zero_<mode>): New insn pattern. >> (vsx_concat_mem_<mode>): New insn_and_split pattern. >> >> gcc/testsuite/ >> PR target/103568 >> * gcc.target/powerpc/pr103568.c: New test. 
>> >> patch.diff >> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md >> index f135fa079bd..f9a2a260e89 100644 >> --- a/gcc/config/rs6000/vsx.md >> +++ b/gcc/config/rs6000/vsx.md >> @@ -1395,6 +1395,27 @@ (define_insn "vsx_ld_elemrev_v2di" >> "lxvd2x %x0,%y1" >> [(set_attr "type" "vecload")]) >> >> +(define_insn "vsx_ld_lowpart_zero_<mode>" >> + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa") >> + (vec_concat:VSX_D >> + (match_operand:<VEC_base> 1 "memory_operand" "wY,Z") >> + (match_operand:<VEC_base> 2 "zero_constant" "j,j")))] >> + "" >> + "@ >> + lxsd %0,%1 >> + lxsdx %x0,%y1" >> + [(set_attr "type" "vecload,vecload") >> + (set_attr "isa" "p9v,p7v")]) >> + >> +(define_insn "vsx_ld_highpart_zero_<mode>" >> + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") >> + (vec_concat:VSX_D >> + (match_operand:<VEC_base> 1 "zero_constant" "j") >> + (match_operand:<VEC_base> 2 "memory_operand" "Z")))] >> + "TARGET_POWER10" >> + "lxvrdx %x0,%y2" >> + [(set_attr "type" "vecload")]) >> + >> (define_insn "vsx_ld_elemrev_v1ti" >> [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa") >> (vec_select:V1TI >> @@ -3063,6 +3084,26 @@ (define_insn "vsx_concat_<mode>" >> } >> [(set_attr "type" "vecperm,vecmove")]) >> >> +(define_insn_and_split "vsx_concat_mem_<mode>" >> + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa") >> + (vec_concat:VSX_D >> + (match_operand:<VEC_base> 1 "memory_operand" "wY,Z") >> + (match_operand:<VEC_base> 2 "memory_operand" "Z,Z")))] >> + "TARGET_POWER10 && can_create_pseudo_p ()" >> + "#" >> + "&& 1" >> + [(const_int 0)] >> +{ >> + rtx tmp1 = gen_reg_rtx (<MODE>mode); >> + rtx tmp2 = gen_reg_rtx (<MODE>mode); >> + emit_insn (gen_vsx_ld_highpart_zero_<mode> (tmp1, CONST0_RTX (<VEC_base>mode), >> + operands[1])); >> + emit_insn (gen_vsx_ld_lowpart_zero_<mode> (tmp2, operands[2], >> + CONST0_RTX (<VEC_base>mode))); >> + emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2)); >> + DONE; >> +}) >> + >> ;; Combiner 
patterns to allow creating XXPERMDI's to access either double >> ;; word element in a vector register. >> (define_insn "*vsx_concat_<mode>_1" >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr103568.c b/gcc/testsuite/gcc.target/powerpc/pr103568.c >> new file mode 100644 >> index 00000000000..b2a06fb2162 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr103568.c >> @@ -0,0 +1,17 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ >> + >> +vector double test (double *a, double *b) >> +{ >> + return (vector double) {*a, *b}; >> +} >> + >> +vector long long test1 (long long *a, long long *b) >> +{ >> + return (vector long long) {*a, *b}; >> +} >> + >> +/* { dg-final { scan-assembler-times {\mlxsd} 2 } } */ >> +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 2 } } */ >> +/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */ >> +
Hi Haochen, on 2024/5/31 11:25, HAO CHEN GUI wrote: > Hi, > This patch optimizes vector construction with two vector doubleword loads. > It generates an optimal insn sequence as "xxlor" has lower latency than > "mtvsrdd" on Power10. > > Compared with previous version, the main change is to use "isa" attribute > to guard "lxsd" and "lxsdx". > https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653103.html > > Bootstrapped and tested on powerpc64-linux BE and LE with no > regressions. OK for the trunk? > > Thanks > Gui Haochen > > ChangeLog > rs6000: Optimize vector construction with two vector doubleword loads > > When constructing a vector by two doublewords from memory, originally it > does > ld 10,0(3) > ld 9,0(4) > mtvsrdd 34,9,10 > > An optimal sequence on Power10 should be > lxsd 0,0(4) > lxvrdx 1,0,3 > xxlor 34,1,32 Thanks for doing this. As noted in comment #0 of the PR, could you also evaluate whether it actually helps the SPEC2017 benchmark 510.parest_r on Power10? > > This patch does this optimization by insn combine and split. > > gcc/ > PR target/103568 > * config/rs6000/vsx.md (vsx_ld_lowpart_zero_<mode>): New insn > pattern. > (vsx_ld_highpart_zero_<mode>): New insn pattern. > (vsx_concat_mem_<mode>): New insn_and_split pattern. > > gcc/testsuite/ > PR target/103568 > * gcc.target/powerpc/pr103568.c: New test. > > patch.diff > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index f135fa079bd..f9a2a260e89 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -1395,6 +1395,27 @@ (define_insn "vsx_ld_elemrev_v2di" > "lxvd2x %x0,%y1" > [(set_attr "type" "vecload")]) > > +(define_insn "vsx_ld_lowpart_zero_<mode>" Nit: Maybe just use the mnemonic in the name?
> + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa") > + (vec_concat:VSX_D > + (match_operand:<VEC_base> 1 "memory_operand" "wY,Z") > + (match_operand:<VEC_base> 2 "zero_constant" "j,j")))] I think we should consider both BE and LE here: this pattern only matches the underlying insn on BE, so we need a new pattern for LE with operand 1 and operand 2 swapped. > + "" > + "@ > + lxsd %0,%1 > + lxsdx %x0,%y1" > + [(set_attr "type" "vecload,vecload") > + (set_attr "isa" "p9v,p7v")]) Guarding this semantic with a pre-Power10 isa is wrong here: these two insns are not guaranteed to zero doubleword element 1 on pre-Power10 processors such as Power9. ISA 3.1: "The contents of doubleword element 1 of VSR[VRT+32] are set to 0." ISA 3.0...2.06: "The contents of doubleword element 1 of VSR[XT] are undefined." > + > +(define_insn "vsx_ld_highpart_zero_<mode>" > + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") > + (vec_concat:VSX_D > + (match_operand:<VEC_base> 1 "zero_constant" "j") > + (match_operand:<VEC_base> 2 "memory_operand" "Z")))] Likewise for this pattern's semantics.
> + "TARGET_POWER10" > + "lxvrdx %x0,%y2" > + [(set_attr "type" "vecload")]) > + > (define_insn "vsx_ld_elemrev_v1ti" > [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa") > (vec_select:V1TI > @@ -3063,6 +3084,26 @@ (define_insn "vsx_concat_<mode>" > } > [(set_attr "type" "vecperm,vecmove")]) > > +(define_insn_and_split "vsx_concat_mem_<mode>" > + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa") > + (vec_concat:VSX_D > + (match_operand:<VEC_base> 1 "memory_operand" "wY,Z") > + (match_operand:<VEC_base> 2 "memory_operand" "Z,Z")))] > + "TARGET_POWER10 && can_create_pseudo_p ()" > + "#" > + "&& 1" > + [(const_int 0)] > +{ > + rtx tmp1 = gen_reg_rtx (<MODE>mode); > + rtx tmp2 = gen_reg_rtx (<MODE>mode); > + emit_insn (gen_vsx_ld_highpart_zero_<mode> (tmp1, CONST0_RTX (<VEC_base>mode), > + operands[1])); > + emit_insn (gen_vsx_ld_lowpart_zero_<mode> (tmp2, operands[2], > + CONST0_RTX (<VEC_base>mode))); > + emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2)); > + DONE; > +}) > + > ;; Combiner patterns to allow creating XXPERMDI's to access either double > ;; word element in a vector register. > (define_insn "*vsx_concat_<mode>_1" > diff --git a/gcc/testsuite/gcc.target/powerpc/pr103568.c b/gcc/testsuite/gcc.target/powerpc/pr103568.c > new file mode 100644 > index 00000000000..b2a06fb2162 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr103568.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ > + > +vector double test (double *a, double *b) > +{ > + return (vector double) {*a, *b}; > +} > + > +vector long long test1 (long long *a, long long *b) > +{ > + return (vector long long) {*a, *b}; > +} > + > +/* { dg-final { scan-assembler-times {\mlxsd} 2 } } */ > +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 2 } } */ > +/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */ > + BR, Kewen
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index f135fa079bd..f9a2a260e89 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -1395,6 +1395,27 @@ (define_insn "vsx_ld_elemrev_v2di" "lxvd2x %x0,%y1" [(set_attr "type" "vecload")]) +(define_insn "vsx_ld_lowpart_zero_<mode>" + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa") + (vec_concat:VSX_D + (match_operand:<VEC_base> 1 "memory_operand" "wY,Z") + (match_operand:<VEC_base> 2 "zero_constant" "j,j")))] + "" + "@ + lxsd %0,%1 + lxsdx %x0,%y1" + [(set_attr "type" "vecload,vecload") + (set_attr "isa" "p9v,p7v")]) + +(define_insn "vsx_ld_highpart_zero_<mode>" + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") + (vec_concat:VSX_D + (match_operand:<VEC_base> 1 "zero_constant" "j") + (match_operand:<VEC_base> 2 "memory_operand" "Z")))] + "TARGET_POWER10" + "lxvrdx %x0,%y2" + [(set_attr "type" "vecload")]) + (define_insn "vsx_ld_elemrev_v1ti" [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa") (vec_select:V1TI @@ -3063,6 +3084,26 @@ (define_insn "vsx_concat_<mode>" } [(set_attr "type" "vecperm,vecmove")]) +(define_insn_and_split "vsx_concat_mem_<mode>" + [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa") + (vec_concat:VSX_D + (match_operand:<VEC_base> 1 "memory_operand" "wY,Z") + (match_operand:<VEC_base> 2 "memory_operand" "Z,Z")))] + "TARGET_POWER10 && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx tmp1 = gen_reg_rtx (<MODE>mode); + rtx tmp2 = gen_reg_rtx (<MODE>mode); + emit_insn (gen_vsx_ld_highpart_zero_<mode> (tmp1, CONST0_RTX (<VEC_base>mode), + operands[1])); + emit_insn (gen_vsx_ld_lowpart_zero_<mode> (tmp2, operands[2], + CONST0_RTX (<VEC_base>mode))); + emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2)); + DONE; +}) + ;; Combiner patterns to allow creating XXPERMDI's to access either double ;; word element in a vector register. 
(define_insn "*vsx_concat_<mode>_1" diff --git a/gcc/testsuite/gcc.target/powerpc/pr103568.c b/gcc/testsuite/gcc.target/powerpc/pr103568.c new file mode 100644 index 00000000000..b2a06fb2162 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr103568.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +vector double test (double *a, double *b) +{ + return (vector double) {*a, *b}; +} + +vector long long test1 (long long *a, long long *b) +{ + return (vector long long) {*a, *b}; +} + +/* { dg-final { scan-assembler-times {\mlxsd} 2 } } */ +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */ +