Message ID | 485dc9bf-fb4d-4a63-e9ec-0e58715d16da@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [PATCHv2,rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769] | expand |
Hi Haochen, on 2023/7/25 10:10, HAO CHEN GUI wrote: > Hi, > This patch modifies vsx extract expand and generates mfvsrwz/stxsiwx > for all subtargets when the mode is V4SI and the index of extracted element > is 1 for BE and 2 for LE. Also this patch adds a insn pattern for mfvsrwz > which helps eliminate redundant zero extend. > > Compared to last version, the main change is to move "vsx_extract_v4si_w1" > and "*mfvsrwz" to the front of "*vsx_extract_<mode>_di_p9". Also some insn > conditions are changed to assertions. > https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625128.html Since the previous one is v2, this is actually v3. ;-) > > Bootstrapped and tested on powerpc64-linux BE and LE with no regressions. > > Thanks > Gui Haochen > > ChangeLog > rs6000: Generate mfvsrwz for all platform and remove redundant zero extend > > mfvsrwz has lower latency than xxextractuw or vextuw[lr]x. So it should be > generated even with p9 vector enabled. Also the instruction is already > zero extended. A combine pattern is needed to eliminate redundant zero > extend instructions. > > gcc/ > PR target/106769 > * config/rs6000/vsx.md (expand vsx_extract_<mode>): Set it only > for V8HI and V16QI. > (vsx_extract_v4si): New expand for V4SI extraction. > (vsx_extract_v4si_w1): New insn pattern for V4SI extraction > when the index of extracted element is 1 with BE and 2 with LE. Nit: Maybe better to match the name with " ... for V4SI extraction on word 1 from BE order." > (*mfvsrwz): New insn pattern. > (*vsx_extract_<mode>_di_p9): Not generate the insn when the index > of extracted element is 1 with BE and 2 with LE. > (*vsx_extract_si): Removed. Nit: s/Removed/Remove/ > (*vsx_extract_v4si_not_w1): New insn and split pattern which deals > with the cases not handled by vsx_extract_v4si_w1. > > gcc/testsuite/ > PR target/106769 > * gcc.target/powerpc/pr106769.h: New. > * gcc.target/powerpc/pr106769-p8.c: New. > * gcc.target/powerpc/pr106769-p9.c: New. > > patch.diff > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index 0a34ceebeb5..0065b76fef8 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2_<mode>_1" > (define_expand "vsx_extract_<mode>" > [(parallel [(set (match_operand:<VEC_base> 0 "gpc_reg_operand") > (vec_select:<VEC_base> > - (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand") > + (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand") > (parallel [(match_operand:QI 2 "const_int_operand")]))) > - (clobber (match_scratch:VSX_EXTRACT_I 3))])] > + (clobber (match_scratch:VSX_EXTRACT_I2 3))])] > "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT" > { > /* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}. */ > @@ -3736,6 +3736,63 @@ (define_expand "vsx_extract_<mode>" > } > }) > > +(define_expand "vsx_extract_v4si" > + [(parallel [(set (match_operand:SI 0 "gpc_reg_operand") > + (vec_select:SI > + (match_operand:V4SI 1 "gpc_reg_operand") > + (parallel [(match_operand:QI 2 "const_0_to_3_operand")]))) > + (clobber (match_scratch:V4SI 3))])] > + "TARGET_DIRECT_MOVE_64BIT" > +{ > + /* The word 1 (BE order) can be extracted by mfvsrwz/stxsiwx. So just > + fall through to vsx_extract_v4si_w1. */ > + if (TARGET_P9_VECTOR > + && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)) > + { > + emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1], > + operands[2])); > + DONE; > + } > +}) > + > +/* Extract from word 1 (BE order). */ Nit: Use semicolon ";" for comments to keep consistent with the others and what the doc says. > +(define_insn "vsx_extract_v4si_w1" > + [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa") > + (vec_select:SI > + (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0") > + (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")]))) > + (clobber (match_scratch:V4SI 3 "=v,v,v,v"))] > + "TARGET_DIRECT_MOVE_64BIT > + && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)" > +{ > + if (which_alternative == 0) > + return "mfvsrwz %0,%x1"; > + > + if (which_alternative == 1) > + return "xxlor %x0,%x1,%x1"; > + > + if (which_alternative == 2) > + return "stxsiwx %x1,%y0"; > + > + return ASM_COMMENT_START " vec_extract to same register"; > +} > + [(set_attr "type" "mfvsr,veclogical,fpstore,*") > + (set_attr "length" "4,4,4,0") > + (set_attr "isa" "p8v,*,p8v,*")]) > + > +(define_insn "*mfvsrwz" > + [(set (match_operand:DI 0 "register_operand" "=r") > + (zero_extend:DI > + (vec_select:SI > + (match_operand:V4SI 1 "vsx_register_operand" "wa") > + (parallel [(match_operand:QI 2 "const_int_operand" "n")])))) > + (clobber (match_scratch:V4SI 3 "=v"))] > + "TARGET_DIRECT_MOVE_64BIT > + && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)" > + "mfvsrwz %0,%x1" > + [(set_attr "type" "mfvsr") > + (set_attr "isa" "p8v")]) > + > (define_insn "vsx_extract_<mode>_p9" > [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r,<VSX_EX>") > (vec_select:<VEC_base> > @@ -3798,7 +3855,7 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9" > (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>") > (parallel [(match_operand:QI 2 "const_int_operand" "n,n")])))) > (clobber (match_scratch:SI 3 "=r,X"))] > - "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB" > + "TARGET_VEXTRACTUB" Nit: I understand this change is based on TARGET_VEXTRACTUB implying the condition VECTOR_MEM_VSX_P (<MODE>mode) always holds, but this line change isn't necessary in this patch and we have a few conditions like this, so I prefer to just leave this alone. > "#" > "&& reload_completed" > [(parallel [(set (match_dup 4) > @@ -3807,6 +3864,9 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9" > (parallel [(match_dup 2)]))) > (clobber (match_dup 3))])] > { > + gcc_assert (<MODE>mode != V4SImode > + || INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)); > + > operands[4] = gen_rtx_REG (<VEC_base>mode, REGNO (operands[0])); > } > [(set_attr "isa" "p9v,*")]) > @@ -3830,58 +3890,43 @@ (define_insn_and_split "*vsx_extract_<mode>_store_p9" > (set (match_dup 0) > (match_dup 3))]) > > -(define_insn_and_split "*vsx_extract_si" > +/* Extract from word 0, 2, 3 (BE order). */ Nit: Use ";". > +(define_insn_and_split "*vsx_extract_v4si_not_w1" Nit: Maybe name it "...v4si_w023" to match the comment? > [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z") > (vec_select:SI > (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v") > (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")]))) > (clobber (match_scratch:V4SI 3 "=v,v,v"))] > - "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT && !TARGET_P9_VECTOR" > + "TARGET_DIRECT_MOVE_64BIT > + && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)" I think the case "INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)" should be already matched by vsx_extract_v4si_w1 above, so move this ... > "#" > - "&& reload_completed" > + "&& 1" > [(const_int 0)] > { > + gcc_assert (!TARGET_P9_VECTOR); ... here as an assertion too. > + > rtx dest = operands[0]; > rtx src = operands[1]; > rtx element = operands[2]; > - rtx vec_tmp = operands[3]; > - int value; > + rtx vec_tmp; > + > + if (GET_CODE (operands[3]) == SCRATCH) > + vec_tmp = gen_reg_rtx (V4SImode); > + else > + vec_tmp = operands[3]; > > /* Adjust index for LE element ordering, the below minuend 3 is computed by > GET_MODE_NUNITS (V4SImode) - 1. */ > if (!BYTES_BIG_ENDIAN) > element = GEN_INT (3 - INTVAL (element)); > > - /* If the value is in the correct position, we can avoid doing the VSPLT<x> > - instruction. */ > - value = INTVAL (element); > - if (value != 1) > - emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element)); > - else > - vec_tmp = src; > - > - if (MEM_P (operands[0])) > - { > - if (can_create_pseudo_p ()) > - dest = rs6000_force_indexed_or_indirect_mem (dest); > - > - if (TARGET_P8_VECTOR) > - emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp))); > - else > - emit_insn (gen_stfiwx (dest, gen_rtx_REG (DImode, REGNO (vec_tmp)))); > - } > + emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element)); > > - else if (TARGET_P8_VECTOR) > - emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp))); > - else > - emit_move_insn (gen_rtx_REG (DImode, REGNO (dest)), > - gen_rtx_REG (DImode, REGNO (vec_tmp))); > + int value = BYTES_BIG_ENDIAN ? 1 : 2; > + emit_insn (gen_vsx_extract_v4si_w1 (dest, vec_tmp, GEN_INT (value))); > > DONE; > -} > - [(set_attr "type" "mfvsr,vecperm,fpstore") > - (set_attr "length" "8") > - (set_attr "isa" "*,p8v,*")]) > +}) > > (define_insn_and_split "*vsx_extract_<mode>_p8" > [(set (match_operand:<VEC_base> 0 "nonimmediate_operand" "=r") > diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c > new file mode 100644 > index 00000000000..e7cdbc76298 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ > +/* { dg-require-effective-target powerpc_p8vector_ok } */ > +/* { dg-options "-mdejagnu-cpu=power8 -O2" } */ > +/* { dg-require-effective-target has_arch_ppc64 } */ > + > +#include "pr106769.h" > + > +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */ > +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */ > +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c > new file mode 100644 > index 00000000000..2205e434a86 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ > +/* { dg-require-effective-target powerpc_p9vector_ok } */ > +/* { dg-options "-mdejagnu-cpu=power9 -O2" } */ > +/* { dg-require-effective-target has_arch_ppc64 } */ > + > +#include "pr106769.h" > + > +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */ > +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */ > +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */ > +/* { dg-final { scan-assembler-not {\mxxextractuw\M} } } */ > +/* { dg-final { scan-assembler-not "vextuw\[rl\]x" } } */ Nit: I think the below with \m and \M should work: /* { dg-final { scan-assembler-not {\mvextuw[rl]x\M} } } */ Okay for trunk with all the nits above tweaked, thanks! BR, Kewen
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 0a34ceebeb5..0065b76fef8 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2_<mode>_1" (define_expand "vsx_extract_<mode>" [(parallel [(set (match_operand:<VEC_base> 0 "gpc_reg_operand") (vec_select:<VEC_base> - (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand") + (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand") (parallel [(match_operand:QI 2 "const_int_operand")]))) - (clobber (match_scratch:VSX_EXTRACT_I 3))])] + (clobber (match_scratch:VSX_EXTRACT_I2 3))])] "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT" { /* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}. */ @@ -3736,6 +3736,63 @@ (define_expand "vsx_extract_<mode>" } }) +(define_expand "vsx_extract_v4si" + [(parallel [(set (match_operand:SI 0 "gpc_reg_operand") + (vec_select:SI + (match_operand:V4SI 1 "gpc_reg_operand") + (parallel [(match_operand:QI 2 "const_0_to_3_operand")]))) + (clobber (match_scratch:V4SI 3))])] + "TARGET_DIRECT_MOVE_64BIT" +{ + /* The word 1 (BE order) can be extracted by mfvsrwz/stxsiwx. So just + fall through to vsx_extract_v4si_w1. */ + if (TARGET_P9_VECTOR + && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)) + { + emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1], + operands[2])); + DONE; + } +}) + +/* Extract from word 1 (BE order). */ +(define_insn "vsx_extract_v4si_w1" + [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa") + (vec_select:SI + (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0") + (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")]))) + (clobber (match_scratch:V4SI 3 "=v,v,v,v"))] + "TARGET_DIRECT_MOVE_64BIT + && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)" +{ + if (which_alternative == 0) + return "mfvsrwz %0,%x1"; + + if (which_alternative == 1) + return "xxlor %x0,%x1,%x1"; + + if (which_alternative == 2) + return "stxsiwx %x1,%y0"; + + return ASM_COMMENT_START " vec_extract to same register"; +} + [(set_attr "type" "mfvsr,veclogical,fpstore,*") + (set_attr "length" "4,4,4,0") + (set_attr "isa" "p8v,*,p8v,*")]) + +(define_insn "*mfvsrwz" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (vec_select:SI + (match_operand:V4SI 1 "vsx_register_operand" "wa") + (parallel [(match_operand:QI 2 "const_int_operand" "n")])))) + (clobber (match_scratch:V4SI 3 "=v"))] + "TARGET_DIRECT_MOVE_64BIT + && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)" + "mfvsrwz %0,%x1" + [(set_attr "type" "mfvsr") + (set_attr "isa" "p8v")]) + (define_insn "vsx_extract_<mode>_p9" [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r,<VSX_EX>") (vec_select:<VEC_base> @@ -3798,7 +3855,7 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9" (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>") (parallel [(match_operand:QI 2 "const_int_operand" "n,n")])))) (clobber (match_scratch:SI 3 "=r,X"))] - "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB" + "TARGET_VEXTRACTUB" "#" "&& reload_completed" [(parallel [(set (match_dup 4) @@ -3807,6 +3864,9 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9" (parallel [(match_dup 2)]))) (clobber (match_dup 3))])] { + gcc_assert (<MODE>mode != V4SImode + || INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)); + operands[4] = gen_rtx_REG (<VEC_base>mode, REGNO (operands[0])); } [(set_attr "isa" "p9v,*")]) @@ -3830,58 +3890,43 @@ (define_insn_and_split "*vsx_extract_<mode>_store_p9" (set (match_dup 0) (match_dup 3))]) -(define_insn_and_split "*vsx_extract_si" +/* Extract from word 0, 2, 3 (BE order). */ +(define_insn_and_split "*vsx_extract_v4si_not_w1" [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z") (vec_select:SI (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v") (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")]))) (clobber (match_scratch:V4SI 3 "=v,v,v"))] - "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT && !TARGET_P9_VECTOR" + "TARGET_DIRECT_MOVE_64BIT + && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)" "#" - "&& reload_completed" + "&& 1" [(const_int 0)] { + gcc_assert (!TARGET_P9_VECTOR); + rtx dest = operands[0]; rtx src = operands[1]; rtx element = operands[2]; - rtx vec_tmp = operands[3]; - int value; + rtx vec_tmp; + + if (GET_CODE (operands[3]) == SCRATCH) + vec_tmp = gen_reg_rtx (V4SImode); + else + vec_tmp = operands[3]; /* Adjust index for LE element ordering, the below minuend 3 is computed by GET_MODE_NUNITS (V4SImode) - 1. */ if (!BYTES_BIG_ENDIAN) element = GEN_INT (3 - INTVAL (element)); - /* If the value is in the correct position, we can avoid doing the VSPLT<x> - instruction. */ - value = INTVAL (element); - if (value != 1) - emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element)); - else - vec_tmp = src; - - if (MEM_P (operands[0])) - { - if (can_create_pseudo_p ()) - dest = rs6000_force_indexed_or_indirect_mem (dest); - - if (TARGET_P8_VECTOR) - emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp))); - else - emit_insn (gen_stfiwx (dest, gen_rtx_REG (DImode, REGNO (vec_tmp)))); - } + emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element)); - else if (TARGET_P8_VECTOR) - emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp))); - else - emit_move_insn (gen_rtx_REG (DImode, REGNO (dest)), - gen_rtx_REG (DImode, REGNO (vec_tmp))); + int value = BYTES_BIG_ENDIAN ? 1 : 2; + emit_insn (gen_vsx_extract_v4si_w1 (dest, vec_tmp, GEN_INT (value))); DONE; -} - [(set_attr "type" "mfvsr,vecperm,fpstore") - (set_attr "length" "8") - (set_attr "isa" "*,p8v,*")]) +}) (define_insn_and_split "*vsx_extract_<mode>_p8" [(set (match_operand:<VEC_base> 0 "nonimmediate_operand" "=r") diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c new file mode 100644 index 00000000000..e7cdbc76298 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-mdejagnu-cpu=power8 -O2" } */ +/* { dg-require-effective-target has_arch_ppc64 } */ + +#include "pr106769.h" + +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */ +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */ +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c new file mode 100644 index 00000000000..2205e434a86 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-options "-mdejagnu-cpu=power9 -O2" } */ +/* { dg-require-effective-target has_arch_ppc64 } */ + +#include "pr106769.h" + +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */ +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */ +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */ +/* { dg-final { scan-assembler-not {\mxxextractuw\M} } } */ +/* { dg-final { scan-assembler-not "vextuw\[rl\]x" } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769.h b/gcc/testsuite/gcc.target/powerpc/pr106769.h new file mode 100644 index 00000000000..1c8c8a024f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr106769.h @@ -0,0 +1,17 @@ +#include <altivec.h> + +#ifdef __BIG_ENDIAN__ +#define LANE 1 +#else +#define LANE 2 +#endif + +unsigned int foo1 (vector unsigned int v) +{ + return vec_extract(v, LANE); +} + +void foo2 (vector unsigned int v, unsigned int* p) +{ + *p = vec_extract(v, LANE); +}