Message ID | 20240815074009.128776-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | [v2,x86] Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload. | expand |
On Thu, Aug 15, 2024 at 3:27 PM liuhongt <hongtao.liu@intel.com> wrote: > > It results in 2 failures for x86_64-pc-linux-gnu{\ > -march=cascadelake}; > > gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o > gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1 > > For pr113560.c, now GCC generates mulx instead of mulq with > -march=cascadelake, which should be optimal, so adjust testcase for > that. > For gcc.target/i386/extendditi2-1.c, RA happens to choose another > register instead of rax and result in > > movq %rdi, %rbp > movq %rdi, %rax > sarq $63, %rbp > movq %rbp, %rdx > > The patch adds a new define_peephole2 for that. > > gcc/ChangeLog: > > PR target/116274 > * config/i386/i386-expand.cc (ix86_expand_vector_move): > Restrict special case TImode to 128-bit vector conversions via > V2DI under ix86_pre_reload_split (). > * config/i386/i386.cc (inline_secondary_memory_needed): > Movement between GENERAL_REGS and SSE_REGS for TImode doesn't > need secondary reload. > * config/i386/i386.md (*extendsidi2_rex64): Add a > define_peephole2 after it. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr116274.c: New test. > * gcc.target/i386/pr113560.c: Scan either mulq or mulx. > --- > gcc/config/i386/i386-expand.cc | 2 +- > gcc/config/i386/i386.cc | 18 ++++++++++++------ > gcc/config/i386/i386.md | 19 +++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr113560.c | 2 +- > gcc/testsuite/gcc.target/i386/pr116274.c | 12 ++++++++++++ > 5 files changed, 45 insertions(+), 8 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr116274.c > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index bdbc1423267..ed546eeed6b 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) > && SUBREG_P (op1) > && GET_MODE (SUBREG_REG (op1)) == TImode > && TARGET_64BIT && TARGET_SSE > - && can_create_pseudo_p ()) > + && ix86_pre_reload_split ()) > { > rtx tmp = gen_reg_rtx (V2DImode); > rtx lo = gen_reg_rtx (DImode); > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index f044826269c..4821892d1e0 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -20292,6 +20292,18 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, > if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2))) > return true; > > + /* If the target says that inter-unit moves are more expensive > + than moving through memory, then don't generate them. */ > + if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) > + || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) > + return true; > + > + /* Under SSE4.1, *movti_internal supports movement between > + SSE_REGS and GENERAL_REGS with pinsrq and pextrq. */ > + if (TARGET_SSE4_1 > + && (TARGET_64BIT ? mode == TImode : mode == DImode)) > + return false; > + > int msize = GET_MODE_SIZE (mode); > > /* Between SSE and general, we have moves no larger than word size. */ > @@ -20304,12 +20316,6 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, > > if (msize < minsize) > return true; > - > - /* If the target says that inter-unit moves are more expensive > - than moving through memory, then don't generate them. */ > - if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) > - || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) > - return true; > } > > return false; > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index db7789c17d2..1962a7ba5c9 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -5041,6 +5041,25 @@ (define_split > DONE; > }) > > +(define_peephole2 > + [(set (match_operand:DI 0 "general_reg_operand") > + (match_operand:DI 1 "general_reg_operand")) > + (parallel [(set (match_dup 0) > + (ashiftrt:DI (match_dup 0) > + (const_int 63))) > + (clobber (reg:CC FLAGS_REG))]) > + (set (match_operand:DI 2 "general_reg_operand") (match_dup 1)) > + (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))] > + "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD) > + && REGNO (operands[2]) == AX_REG > + && REGNO (operands[3]) == DX_REG > + && peep2_reg_dead_p (4, operands[0]) > + && !reg_mentioned_p (operands[0], operands[1]) > + && !reg_mentioned_p (operands[2], operands[0])" > + [(set (match_dup 2) (match_dup 1)) > + (parallel [(set (match_dup 3) (ashiftrt:DI (match_dup 2) (const_int 63))) > + (clobber (reg:CC FLAGS_REG))])]) > + > (define_insn "extend<mode>di2" > [(set (match_operand:DI 0 "register_operand" "=r") > (sign_extend:DI > diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c b/gcc/testsuite/gcc.target/i386/pr113560.c > index ac2e01a4589..9431a2d1d90 100644 > --- a/gcc/testsuite/gcc.target/i386/pr113560.c > +++ b/gcc/testsuite/gcc.target/i386/pr113560.c > @@ -11,7 +11,7 @@ __int128 bar(__int128 x, __int128 y) > return (x & 1000) * (y & 1000); > } > > -/* { dg-final { scan-assembler-times "\tmulq" 1 } } */ > +/* { dg-final { scan-assembler-times "\tmul\[qx\]" 1 } } */ > /* { dg-final { scan-assembler-times "\timulq" 1 } } */ > /* { dg-final { scan-assembler-not "addq" } } */ > /* { dg-final { scan-assembler-not "xorl" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr116274.c b/gcc/testsuite/gcc.target/i386/pr116274.c > new file mode 100644 > index 00000000000..51ac3e1572d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr116274.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O3 -fno-vect-cost-model -msse4.1" } */ > + > +struct aq { long x,y; }; > +long testq(struct aq a) { return a.x+a.y; } > + > +struct aw { short a0,a1,a2,a3,a4,a5,a6,a7; }; > +short testw(struct aw a) { return a.a0+a.a1+a.a2+a.a3+a.a4+a.a5+a.a6+a.a7; } > + > +struct ad { int x,y,z,w; }; > +int testd(struct ad a) { return a.x+a.y+a.z+a.w; } > +/* { dg-final { scan-assembler-not "%rsp" } } */ > -- > 2.43.0 >
On Thu, Aug 15, 2024 at 9:27 AM liuhongt <hongtao.liu@intel.com> wrote: > > It results in 2 failures for x86_64-pc-linux-gnu{\ > -march=cascadelake}; > > gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o > gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1 > > For pr113560.c, now GCC generates mulx instead of mulq with > -march=cascadelake, which should be optimal, so adjust testcase for > that. > For gcc.target/i386/extendditi2-1.c, RA happens to choose another > register instead of rax and result in > > movq %rdi, %rbp > movq %rdi, %rax > sarq $63, %rbp > movq %rbp, %rdx > > The patch adds a new define_peephole2 for that. > > gcc/ChangeLog: > > PR target/116274 > * config/i386/i386-expand.cc (ix86_expand_vector_move): > Restrict special case TImode to 128-bit vector conversions via > V2DI under ix86_pre_reload_split (). > * config/i386/i386.cc (inline_secondary_memory_needed): > Movement between GENERAL_REGS and SSE_REGS for TImode doesn't > need secondary reload. > * config/i386/i386.md (*extendsidi2_rex64): Add a > define_peephole2 after it. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr116274.c: New test. > * gcc.target/i386/pr113560.c: Scan either mulq or mulx. OK, with updated comment, as proposed below. Thanks, Uros. > --- > gcc/config/i386/i386-expand.cc | 2 +- > gcc/config/i386/i386.cc | 18 ++++++++++++------ > gcc/config/i386/i386.md | 19 +++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr113560.c | 2 +- > gcc/testsuite/gcc.target/i386/pr116274.c | 12 ++++++++++++ > 5 files changed, 45 insertions(+), 8 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr116274.c > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index bdbc1423267..ed546eeed6b 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) > && SUBREG_P (op1) > && GET_MODE (SUBREG_REG (op1)) == TImode > && TARGET_64BIT && TARGET_SSE > - && can_create_pseudo_p ()) > + && ix86_pre_reload_split ()) > { > rtx tmp = gen_reg_rtx (V2DImode); > rtx lo = gen_reg_rtx (DImode); > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index f044826269c..4821892d1e0 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -20292,6 +20292,18 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, > if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2))) > return true; > > + /* If the target says that inter-unit moves are more expensive > + than moving through memory, then don't generate them. */ > + if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) > + || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) > + return true; > + > + /* Under SSE4.1, *movti_internal supports movement between > + SSE_REGS and GENERAL_REGS with pinsrq and pextrq. */ WIth SSE4.1, *mov{ti,di}_internal supports moves between SSE_REGS and GENERAL_REGS using pinsr{q,d} or pextr{q,d}. > + if (TARGET_SSE4_1 > + && (TARGET_64BIT ? mode == TImode : mode == DImode)) > + return false; > + > int msize = GET_MODE_SIZE (mode); > > /* Between SSE and general, we have moves no larger than word size. */ > @@ -20304,12 +20316,6 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, > > if (msize < minsize) > return true; > - > - /* If the target says that inter-unit moves are more expensive > - than moving through memory, then don't generate them. */ > - if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) > - || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) > - return true; > } > > return false; > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index db7789c17d2..1962a7ba5c9 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -5041,6 +5041,25 @@ (define_split > DONE; > }) > > +(define_peephole2 > + [(set (match_operand:DI 0 "general_reg_operand") > + (match_operand:DI 1 "general_reg_operand")) > + (parallel [(set (match_dup 0) > + (ashiftrt:DI (match_dup 0) > + (const_int 63))) > + (clobber (reg:CC FLAGS_REG))]) > + (set (match_operand:DI 2 "general_reg_operand") (match_dup 1)) > + (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))] > + "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD) > + && REGNO (operands[2]) == AX_REG > + && REGNO (operands[3]) == DX_REG > + && peep2_reg_dead_p (4, operands[0]) > + && !reg_mentioned_p (operands[0], operands[1]) > + && !reg_mentioned_p (operands[2], operands[0])" > + [(set (match_dup 2) (match_dup 1)) > + (parallel [(set (match_dup 3) (ashiftrt:DI (match_dup 2) (const_int 63))) > + (clobber (reg:CC FLAGS_REG))])]) > + > (define_insn "extend<mode>di2" > [(set (match_operand:DI 0 "register_operand" "=r") > (sign_extend:DI > diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c b/gcc/testsuite/gcc.target/i386/pr113560.c > index ac2e01a4589..9431a2d1d90 100644 > --- a/gcc/testsuite/gcc.target/i386/pr113560.c > +++ b/gcc/testsuite/gcc.target/i386/pr113560.c > @@ -11,7 +11,7 @@ __int128 bar(__int128 x, __int128 y) > return (x & 1000) * (y & 1000); > } > > -/* { dg-final { scan-assembler-times "\tmulq" 1 } } */ > +/* { dg-final { scan-assembler-times "\tmul\[qx\]" 1 } } */ > /* { dg-final { scan-assembler-times "\timulq" 1 } } */ > /* { dg-final { scan-assembler-not "addq" } } */ > /* { dg-final { scan-assembler-not "xorl" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr116274.c b/gcc/testsuite/gcc.target/i386/pr116274.c > new file mode 100644 > index 00000000000..51ac3e1572d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr116274.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O3 -fno-vect-cost-model -msse4.1" } */ > + > +struct aq { long x,y; }; > +long testq(struct aq a) { return a.x+a.y; } > + > +struct aw { short a0,a1,a2,a3,a4,a5,a6,a7; }; > +short testw(struct aw a) { return a.a0+a.a1+a.a2+a.a3+a.a4+a.a5+a.a6+a.a7; } > + > +struct ad { int x,y,z,w; }; > +int testd(struct ad a) { return a.x+a.y+a.z+a.w; } > +/* { dg-final { scan-assembler-not "%rsp" } } */ > -- > 2.43.0 >
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index bdbc1423267..ed546eeed6b 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) && SUBREG_P (op1) && GET_MODE (SUBREG_REG (op1)) == TImode && TARGET_64BIT && TARGET_SSE - && can_create_pseudo_p ()) + && ix86_pre_reload_split ()) { rtx tmp = gen_reg_rtx (V2DImode); rtx lo = gen_reg_rtx (DImode); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index f044826269c..4821892d1e0 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -20292,6 +20292,18 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2))) return true; + /* If the target says that inter-unit moves are more expensive + than moving through memory, then don't generate them. */ + if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) + || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) + return true; + + /* Under SSE4.1, *movti_internal supports movement between + SSE_REGS and GENERAL_REGS with pinsrq and pextrq. */ + if (TARGET_SSE4_1 + && (TARGET_64BIT ? mode == TImode : mode == DImode)) + return false; + int msize = GET_MODE_SIZE (mode); /* Between SSE and general, we have moves no larger than word size. */ @@ -20304,12 +20316,6 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, if (msize < minsize) return true; - - /* If the target says that inter-unit moves are more expensive - than moving through memory, then don't generate them. */ - if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) - || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) - return true; } return false; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index db7789c17d2..1962a7ba5c9 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -5041,6 +5041,25 @@ (define_split DONE; }) +(define_peephole2 + [(set (match_operand:DI 0 "general_reg_operand") + (match_operand:DI 1 "general_reg_operand")) + (parallel [(set (match_dup 0) + (ashiftrt:DI (match_dup 0) + (const_int 63))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_operand:DI 2 "general_reg_operand") (match_dup 1)) + (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))] + "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD) + && REGNO (operands[2]) == AX_REG + && REGNO (operands[3]) == DX_REG + && peep2_reg_dead_p (4, operands[0]) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[2], operands[0])" + [(set (match_dup 2) (match_dup 1)) + (parallel [(set (match_dup 3) (ashiftrt:DI (match_dup 2) (const_int 63))) + (clobber (reg:CC FLAGS_REG))])]) + (define_insn "extend<mode>di2" [(set (match_operand:DI 0 "register_operand" "=r") (sign_extend:DI diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c b/gcc/testsuite/gcc.target/i386/pr113560.c index ac2e01a4589..9431a2d1d90 100644 --- a/gcc/testsuite/gcc.target/i386/pr113560.c +++ b/gcc/testsuite/gcc.target/i386/pr113560.c @@ -11,7 +11,7 @@ __int128 bar(__int128 x, __int128 y) return (x & 1000) * (y & 1000); } -/* { dg-final { scan-assembler-times "\tmulq" 1 } } */ +/* { dg-final { scan-assembler-times "\tmul\[qx\]" 1 } } */ /* { dg-final { scan-assembler-times "\timulq" 1 } } */ /* { dg-final { scan-assembler-not "addq" } } */ /* { dg-final { scan-assembler-not "xorl" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr116274.c b/gcc/testsuite/gcc.target/i386/pr116274.c new file mode 100644 index 00000000000..51ac3e1572d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr116274.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O3 -fno-vect-cost-model -msse4.1" } */ + +struct aq { long x,y; }; +long testq(struct aq a) { return a.x+a.y; } + +struct aw { short a0,a1,a2,a3,a4,a5,a6,a7; }; +short testw(struct aw a) { return a.a0+a.a1+a.a2+a.a3+a.a4+a.a5+a.a6+a.a7; } + +struct ad { int x,y,z,w; }; +int testd(struct ad a) { return a.x+a.y+a.z+a.w; } +/* { dg-final { scan-assembler-not "%rsp" } } */