| Message ID | 009201d9ba7c$a64374d0$f2ca5e70$@nextmovesoftware.com |
|---|---|
| State | New |
| Headers | show |
| Series | [x86_64] More TImode parameter passing improvements. (expand) |
On Wed, Jul 19, 2023 at 10:07 PM Roger Sayle <roger@nextmovesoftware.com> wrote: > > > This patch is the next piece of a solution to the x86_64 ABI issues in > PR 88873. This splits the *concat<mode><dwi>3_3 define_insn_and_split > into two patterns, a TARGET_64BIT *concatditi3_3 and a !TARGET_64BIT > *concatsidi3_3. This allows us to add an additional alternative to the > the 64-bit version, enabling the register allocator to perform this > operation using SSE registers, which is implemented/split after reload > using vec_concatv2di. > > To demonstrate the improvement, the test case from PR88873: > > typedef struct { double x, y; } s_t; > > s_t foo (s_t a, s_t b, s_t c) > { > return (s_t){ __builtin_fma(a.x, b.x, c.x), __builtin_fma (a.y, b.y, c.y) > }; > } > > when compiled with -O2 -march=cascadelake, currently generates: > > foo: vmovq %xmm2, -56(%rsp) > movq -56(%rsp), %rax > vmovq %xmm3, -48(%rsp) > vmovq %xmm4, -40(%rsp) > movq -48(%rsp), %rcx > vmovq %xmm5, -32(%rsp) > vmovq %rax, %xmm6 > movq -40(%rsp), %rax > movq -32(%rsp), %rsi > vpinsrq $1, %rcx, %xmm6, %xmm6 > vmovq %xmm0, -24(%rsp) > vmovq %rax, %xmm7 > vmovq %xmm1, -16(%rsp) > vmovapd %xmm6, %xmm2 > vpinsrq $1, %rsi, %xmm7, %xmm7 > vfmadd132pd -24(%rsp), %xmm7, %xmm2 > vmovapd %xmm2, -56(%rsp) > vmovsd -48(%rsp), %xmm1 > vmovsd -56(%rsp), %xmm0 > ret > > with this change, we avoid many of the reloads via memory, > > foo: vpunpcklqdq %xmm3, %xmm2, %xmm7 > vpunpcklqdq %xmm1, %xmm0, %xmm6 > vpunpcklqdq %xmm5, %xmm4, %xmm2 > vmovdqa %xmm7, -24(%rsp) > vmovdqa %xmm6, %xmm1 > movq -16(%rsp), %rax > vpinsrq $1, %rax, %xmm7, %xmm4 > vmovapd %xmm4, %xmm6 > vfmadd132pd %xmm1, %xmm2, %xmm6 > vmovapd %xmm6, -24(%rsp) > vmovsd -16(%rsp), %xmm1 > vmovsd -24(%rsp), %xmm0 > ret > > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap > and make -k check, both with and without --target_board=unix{-m32} > with no new failures. Ok for mainline? 
> > > 2023-07-19 Roger Sayle <roger@nextmovesoftware.com> > > gcc/ChangeLog > * config/i386/i386-expand.cc (ix86_expand_move): Don't call > force_reg, to use SUBREG rather than create a new pseudo when > inserting DFmode fields into TImode with insvti_{high,low}part. > (*concat<mode><dwi>3_3): Split into two define_insn_and_split... > (*concatditi3_3): 64-bit implementation. Provide alternative > that allows register allocation to use SSE registers that is > split into vec_concatv2di after reload. > (*concatsidi3_3): 32-bit implementation. > > gcc/testsuite/ChangeLog > * gcc.target/i386/pr88873.c: New test case. diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index f9b0dc6..9c3febe 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -558,7 +558,7 @@ ix86_expand_move (machine_mode mode, rtx operands[]) op0 = SUBREG_REG (op0); tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp); if (mode == DFmode) - op1 = force_reg (DImode, gen_lowpart (DImode, op1)); + op1 = gen_lowpart (DImode, op1); Please note that gen_lowpart will ICE when op1 is a SUBREG. This is the reason that we need to first force a SUBREG to a register and then perform gen_lowpart, and it is necessary to avoid ICE. op1 = gen_rtx_ZERO_EXTEND (TImode, op1); op1 = gen_rtx_IOR (TImode, tmp, op1); } @@ -570,7 +570,7 @@ ix86_expand_move (machine_mode mode, rtx operands[]) op0 = SUBREG_REG (op0); tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp); if (mode == DFmode) - op1 = force_reg (DImode, gen_lowpart (DImode, op1)); + op1 = gen_lowpart (DImode, op1); Also here. op1 = gen_rtx_ZERO_EXTEND (TImode, op1); op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64)); op1 = gen_rtx_IOR (TImode, tmp, op1); Uros.
Hi Uros, > From: Uros Bizjak <ubizjak@gmail.com> > Sent: 20 July 2023 07:50 > > On Wed, Jul 19, 2023 at 10:07 PM Roger Sayle <roger@nextmovesoftware.com> > wrote: > > > > This patch is the next piece of a solution to the x86_64 ABI issues in > > PR 88873. This splits the *concat<mode><dwi>3_3 define_insn_and_split > > into two patterns, a TARGET_64BIT *concatditi3_3 and a !TARGET_64BIT > > *concatsidi3_3. This allows us to add an additional alternative to > > the the 64-bit version, enabling the register allocator to perform > > this operation using SSE registers, which is implemented/split after > > reload using vec_concatv2di. > > > > To demonstrate the improvement, the test case from PR88873: > > > > typedef struct { double x, y; } s_t; > > > > s_t foo (s_t a, s_t b, s_t c) > > { > > return (s_t){ __builtin_fma(a.x, b.x, c.x), __builtin_fma (a.y, b.y, > > c.y) }; } > > > > when compiled with -O2 -march=cascadelake, currently generates: > > > > foo: vmovq %xmm2, -56(%rsp) > > movq -56(%rsp), %rax > > vmovq %xmm3, -48(%rsp) > > vmovq %xmm4, -40(%rsp) > > movq -48(%rsp), %rcx > > vmovq %xmm5, -32(%rsp) > > vmovq %rax, %xmm6 > > movq -40(%rsp), %rax > > movq -32(%rsp), %rsi > > vpinsrq $1, %rcx, %xmm6, %xmm6 > > vmovq %xmm0, -24(%rsp) > > vmovq %rax, %xmm7 > > vmovq %xmm1, -16(%rsp) > > vmovapd %xmm6, %xmm2 > > vpinsrq $1, %rsi, %xmm7, %xmm7 > > vfmadd132pd -24(%rsp), %xmm7, %xmm2 > > vmovapd %xmm2, -56(%rsp) > > vmovsd -48(%rsp), %xmm1 > > vmovsd -56(%rsp), %xmm0 > > ret > > > > with this change, we avoid many of the reloads via memory, > > > > foo: vpunpcklqdq %xmm3, %xmm2, %xmm7 > > vpunpcklqdq %xmm1, %xmm0, %xmm6 > > vpunpcklqdq %xmm5, %xmm4, %xmm2 > > vmovdqa %xmm7, -24(%rsp) > > vmovdqa %xmm6, %xmm1 > > movq -16(%rsp), %rax > > vpinsrq $1, %rax, %xmm7, %xmm4 > > vmovapd %xmm4, %xmm6 > > vfmadd132pd %xmm1, %xmm2, %xmm6 > > vmovapd %xmm6, -24(%rsp) > > vmovsd -16(%rsp), %xmm1 > > vmovsd -24(%rsp), %xmm0 > > ret > > > > > > This patch has been tested on 
x86_64-pc-linux-gnu with make bootstrap > > and make -k check, both with and without --target_board=unix{-m32} > > with no new failures. Ok for mainline? > > > > > > 2023-07-19 Roger Sayle <roger@nextmovesoftware.com> > > > > gcc/ChangeLog > > * config/i386/i386-expand.cc (ix86_expand_move): Don't call > > force_reg, to use SUBREG rather than create a new pseudo when > > inserting DFmode fields into TImode with insvti_{high,low}part. > > (*concat<mode><dwi>3_3): Split into two define_insn_and_split... > > (*concatditi3_3): 64-bit implementation. Provide alternative > > that allows register allocation to use SSE registers that is > > split into vec_concatv2di after reload. > > (*concatsidi3_3): 32-bit implementation. > > > > gcc/testsuite/ChangeLog > > * gcc.target/i386/pr88873.c: New test case. > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index f9b0dc6..9c3febe 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -558,7 +558,7 @@ ix86_expand_move (machine_mode mode, rtx > operands[]) > op0 = SUBREG_REG (op0); > tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp); > if (mode == DFmode) > - op1 = force_reg (DImode, gen_lowpart (DImode, op1)); > + op1 = gen_lowpart (DImode, op1); > > Please note that gen_lowpart will ICE when op1 is a SUBREG. This is the reason > that we need to first force a SUBREG to a register and then perform gen_lowpart, > and it is necessary to avoid ICE. The good news is that we know op1 is a register, as this is tested by "&& REG_P (op1)" on line 551. You'll also notice that I'm not removing the force_reg from before the call to gen_lowpart, but removing the call to force_reg after the call to gen_lowpart. When I originally wrote this, the hope was that placing this SUBREG in its own pseudo would help with register allocation/CSE. 
Unfortunately, increasing the number of pseudos (in this case) increases compile-time (due to quadratic behaviour in LRA), as shown by PR rtl-optimization/110587, and keeping the DF->DI conversion in a SUBREG inside the insvti_{high,low}part allows the register allocator to see the DF->DI->TI sequence in a single pattern, and hence choose to keep the TI mode in SSE registers, rather than use a pair of reloads, to write the DF value to memory, then read it back as a scalar in DImode, and perhaps the same again to go the other way. > op1 = gen_rtx_ZERO_EXTEND (TImode, op1); > op1 = gen_rtx_IOR (TImode, tmp, op1); > } > @@ -570,7 +570,7 @@ ix86_expand_move (machine_mode mode, rtx > operands[]) > op0 = SUBREG_REG (op0); > tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp); > if (mode == DFmode) > - op1 = force_reg (DImode, gen_lowpart (DImode, op1)); > + op1 = gen_lowpart (DImode, op1); > > Also here.
On Thu, Jul 20, 2023 at 9:44 AM Roger Sayle <roger@nextmovesoftware.com> wrote: > > > Hi Uros, > > > From: Uros Bizjak <ubizjak@gmail.com> > > Sent: 20 July 2023 07:50 > > > > On Wed, Jul 19, 2023 at 10:07 PM Roger Sayle <roger@nextmovesoftware.com> > > wrote: > > > > > > This patch is the next piece of a solution to the x86_64 ABI issues in > > > PR 88873. This splits the *concat<mode><dwi>3_3 define_insn_and_split > > > into two patterns, a TARGET_64BIT *concatditi3_3 and a !TARGET_64BIT > > > *concatsidi3_3. This allows us to add an additional alternative to > > > the the 64-bit version, enabling the register allocator to perform > > > this operation using SSE registers, which is implemented/split after > > > reload using vec_concatv2di. > > > > > > To demonstrate the improvement, the test case from PR88873: > > > > > > typedef struct { double x, y; } s_t; > > > > > > s_t foo (s_t a, s_t b, s_t c) > > > { > > > return (s_t){ __builtin_fma(a.x, b.x, c.x), __builtin_fma (a.y, b.y, > > > c.y) }; } > > > > > > when compiled with -O2 -march=cascadelake, currently generates: > > > > > > foo: vmovq %xmm2, -56(%rsp) > > > movq -56(%rsp), %rax > > > vmovq %xmm3, -48(%rsp) > > > vmovq %xmm4, -40(%rsp) > > > movq -48(%rsp), %rcx > > > vmovq %xmm5, -32(%rsp) > > > vmovq %rax, %xmm6 > > > movq -40(%rsp), %rax > > > movq -32(%rsp), %rsi > > > vpinsrq $1, %rcx, %xmm6, %xmm6 > > > vmovq %xmm0, -24(%rsp) > > > vmovq %rax, %xmm7 > > > vmovq %xmm1, -16(%rsp) > > > vmovapd %xmm6, %xmm2 > > > vpinsrq $1, %rsi, %xmm7, %xmm7 > > > vfmadd132pd -24(%rsp), %xmm7, %xmm2 > > > vmovapd %xmm2, -56(%rsp) > > > vmovsd -48(%rsp), %xmm1 > > > vmovsd -56(%rsp), %xmm0 > > > ret > > > > > > with this change, we avoid many of the reloads via memory, > > > > > > foo: vpunpcklqdq %xmm3, %xmm2, %xmm7 > > > vpunpcklqdq %xmm1, %xmm0, %xmm6 > > > vpunpcklqdq %xmm5, %xmm4, %xmm2 > > > vmovdqa %xmm7, -24(%rsp) > > > vmovdqa %xmm6, %xmm1 > > > movq -16(%rsp), %rax > > > vpinsrq $1, %rax, %xmm7, %xmm4 > > > 
vmovapd %xmm4, %xmm6 > > > vfmadd132pd %xmm1, %xmm2, %xmm6 > > > vmovapd %xmm6, -24(%rsp) > > > vmovsd -16(%rsp), %xmm1 > > > vmovsd -24(%rsp), %xmm0 > > > ret > > > > > > > > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap > > > and make -k check, both with and without --target_board=unix{-m32} > > > with no new failures. Ok for mainline? > > > > > > > > > 2023-07-19 Roger Sayle <roger@nextmovesoftware.com> > > > > > > gcc/ChangeLog > > > * config/i386/i386-expand.cc (ix86_expand_move): Don't call > > > force_reg, to use SUBREG rather than create a new pseudo when > > > inserting DFmode fields into TImode with insvti_{high,low}part. > > > (*concat<mode><dwi>3_3): Split into two define_insn_and_split... > > > (*concatditi3_3): 64-bit implementation. Provide alternative > > > that allows register allocation to use SSE registers that is > > > split into vec_concatv2di after reload. > > > (*concatsidi3_3): 32-bit implementation. > > > > > > gcc/testsuite/ChangeLog > > > * gcc.target/i386/pr88873.c: New test case. > > > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > > index f9b0dc6..9c3febe 100644 > > --- a/gcc/config/i386/i386-expand.cc > > +++ b/gcc/config/i386/i386-expand.cc > > @@ -558,7 +558,7 @@ ix86_expand_move (machine_mode mode, rtx > > operands[]) > > op0 = SUBREG_REG (op0); > > tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp); > > if (mode == DFmode) > > - op1 = force_reg (DImode, gen_lowpart (DImode, op1)); > > + op1 = gen_lowpart (DImode, op1); > > > > Please note that gen_lowpart will ICE when op1 is a SUBREG. This is the reason > > that we need to first force a SUBREG to a register and then perform gen_lowpart, > > and it is necessary to avoid ICE. > > The good news is that we know op1 is a register, as this is tested by > "&& REG_P (op1)" on line 551. 
You'll also notice that I'm not removing > the force_reg from before the call to gen_lowpart, but removing the call > to force_reg after the call to gen_lowpart. When I originally wrote this, > the hope was that placing this SUBREG in its own pseudo would help > with register allocation/CSE. Unfortunately, increasing the number of > pseudos (in this case) increases compile-time (due to quadratic behaviour > in LRA), as shown by PR rtl-optimization/110587, and keeping the DF->DI > conversion in a SUBREG inside the insvti_{high,low}part allows the > register allocator to see the DF->DI->TI sequence in a single pattern, > and hence choose to keep the TI mode in SSE registers, rather than use > a pair of reloads, to write the DF value to memory, then read it back as > a scalar in DImode, and perhaps the same again to go the other way. This was my only concern with the patch, with that cleared, the patch is OK. Thanks, Uros.
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index f9b0dc6..9c3febe 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -558,7 +558,7 @@ ix86_expand_move (machine_mode mode, rtx operands[]) op0 = SUBREG_REG (op0); tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp); if (mode == DFmode) - op1 = force_reg (DImode, gen_lowpart (DImode, op1)); + op1 = gen_lowpart (DImode, op1); op1 = gen_rtx_ZERO_EXTEND (TImode, op1); op1 = gen_rtx_IOR (TImode, tmp, op1); } @@ -570,7 +570,7 @@ ix86_expand_move (machine_mode mode, rtx operands[]) op0 = SUBREG_REG (op0); tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp); if (mode == DFmode) - op1 = force_reg (DImode, gen_lowpart (DImode, op1)); + op1 = gen_lowpart (DImode, op1); op1 = gen_rtx_ZERO_EXTEND (TImode, op1); op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64)); op1 = gen_rtx_IOR (TImode, tmp, op1); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 47ea050..8c54aa5 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -12408,21 +12408,47 @@ DONE; }) -(define_insn_and_split "*concat<mode><dwi>3_3" - [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r") - (any_or_plus:<DWI> - (ashift:<DWI> - (zero_extend:<DWI> - (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m")) +(define_insn_and_split "*concatditi3_3" + [(set (match_operand:TI 0 "nonimmediate_operand" "=ro,r,r,&r,x") + (any_or_plus:TI + (ashift:TI + (zero_extend:TI + (match_operand:DI 1 "nonimmediate_operand" "r,m,r,m,x")) (match_operand:QI 2 "const_int_operand")) - (zero_extend:<DWI> - (match_operand:DWIH 3 "nonimmediate_operand" "r,r,m,m"))))] - "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT" + (zero_extend:TI + (match_operand:DI 3 "nonimmediate_operand" "r,r,m,m,0"))))] + "TARGET_64BIT + && INTVAL (operands[2]) == 64" + "#" + "&& reload_completed" + [(const_int 0)] +{ + if (SSE_REG_P (operands[0])) + { + rtx tmp = gen_rtx_REG (V2DImode, REGNO (operands[0])); + 
emit_insn (gen_vec_concatv2di (tmp, operands[3], operands[1])); + } + else + split_double_concat (TImode, operands[0], operands[3], operands[1]); + DONE; +}) + +(define_insn_and_split "*concatsidi3_3" + [(set (match_operand:DI 0 "nonimmediate_operand" "=ro,r,r,&r") + (any_or_plus:DI + (ashift:DI + (zero_extend:DI + (match_operand:SI 1 "nonimmediate_operand" "r,m,r,m")) + (match_operand:QI 2 "const_int_operand")) + (zero_extend:DI + (match_operand:SI 3 "nonimmediate_operand" "r,r,m,m"))))] + "!TARGET_64BIT + && INTVAL (operands[2]) == 32" "#" "&& reload_completed" [(const_int 0)] { - split_double_concat (<DWI>mode, operands[0], operands[3], operands[1]); + split_double_concat (DImode, operands[0], operands[3], operands[1]); DONE; })