diff mbox series

[x86_64] More TImode parameter passing improvements.

Message ID 009201d9ba7c$a64374d0$f2ca5e70$@nextmovesoftware.com
State New
Headers show
Series [x86_64] More TImode parameter passing improvements. | expand

Commit Message

Roger Sayle July 19, 2023, 8:07 p.m. UTC
This patch is the next piece of a solution to the x86_64 ABI issues in
PR 88873.  This splits the *concat<mode><dwi>3_3 define_insn_and_split
into two patterns, a TARGET_64BIT *concatditi3_3 and a !TARGET_64BIT
*concatsidi3_3.  This allows us to add an additional alternative to the
64-bit version, enabling the register allocator to perform this
operation using SSE registers, which is implemented/split after reload
using vec_concatv2di.

To demonstrate the improvement, the test case from PR88873:

typedef struct { double x, y; } s_t;

s_t foo (s_t a, s_t b, s_t c)
{
  return (s_t){ __builtin_fma(a.x, b.x, c.x), __builtin_fma (a.y, b.y, c.y)
};
}

when compiled with -O2 -march=cascadelake, currently generates:

foo:    vmovq   %xmm2, -56(%rsp)
        movq    -56(%rsp), %rax
        vmovq   %xmm3, -48(%rsp)
        vmovq   %xmm4, -40(%rsp)
        movq    -48(%rsp), %rcx
        vmovq   %xmm5, -32(%rsp)
        vmovq   %rax, %xmm6
        movq    -40(%rsp), %rax
        movq    -32(%rsp), %rsi
        vpinsrq $1, %rcx, %xmm6, %xmm6
        vmovq   %xmm0, -24(%rsp)
        vmovq   %rax, %xmm7
        vmovq   %xmm1, -16(%rsp)
        vmovapd %xmm6, %xmm2
        vpinsrq $1, %rsi, %xmm7, %xmm7
        vfmadd132pd     -24(%rsp), %xmm7, %xmm2
        vmovapd %xmm2, -56(%rsp)
        vmovsd  -48(%rsp), %xmm1
        vmovsd  -56(%rsp), %xmm0
        ret

with this change, we avoid many of the reloads via memory,

foo:    vpunpcklqdq     %xmm3, %xmm2, %xmm7
        vpunpcklqdq     %xmm1, %xmm0, %xmm6
        vpunpcklqdq     %xmm5, %xmm4, %xmm2
        vmovdqa %xmm7, -24(%rsp)
        vmovdqa %xmm6, %xmm1
        movq    -16(%rsp), %rax
        vpinsrq $1, %rax, %xmm7, %xmm4
        vmovapd %xmm4, %xmm6
        vfmadd132pd     %xmm1, %xmm2, %xmm6
        vmovapd %xmm6, -24(%rsp)
        vmovsd  -16(%rsp), %xmm1
        vmovsd  -24(%rsp), %xmm0
        ret


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2023-07-19  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/i386/i386-expand.cc (ix86_expand_move): Don't call
        force_reg, to use SUBREG rather than create a new pseudo when
        inserting DFmode fields into TImode with insvti_{high,low}part.
        * config/i386/i386.md (*concat<mode><dwi>3_3): Split into two
        define_insn_and_split...
        (*concatditi3_3): 64-bit implementation.  Provide alternative
        that allows register allocation to use SSE registers that is
        split into vec_concatv2di after reload.
        (*concatsidi3_3): 32-bit implementation.

gcc/testsuite/ChangeLog
        * gcc.target/i386/pr88873.c: New test case.


Thanks in advance,
Roger
--

Comments

Uros Bizjak July 20, 2023, 6:49 a.m. UTC | #1
On Wed, Jul 19, 2023 at 10:07 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch is the next piece of a solution to the x86_64 ABI issues in
> PR 88873.  This splits the *concat<mode><dwi>3_3 define_insn_and_split
> into two patterns, a TARGET_64BIT *concatditi3_3 and a !TARGET_64BIT
> *concatsidi3_3.  This allows us to add an additional alternative to the
> the 64-bit version, enabling the register allocator to perform this
> operation using SSE registers, which is implemented/split after reload
> using vec_concatv2di.
>
> To demonstrate the improvement, the test case from PR88873:
>
> typedef struct { double x, y; } s_t;
>
> s_t foo (s_t a, s_t b, s_t c)
> {
>   return (s_t){ __builtin_fma(a.x, b.x, c.x), __builtin_fma (a.y, b.y, c.y)
> };
> }
>
> when compiled with -O2 -march=cascadelake, currently generates:
>
> foo:    vmovq   %xmm2, -56(%rsp)
>         movq    -56(%rsp), %rax
>         vmovq   %xmm3, -48(%rsp)
>         vmovq   %xmm4, -40(%rsp)
>         movq    -48(%rsp), %rcx
>         vmovq   %xmm5, -32(%rsp)
>         vmovq   %rax, %xmm6
>         movq    -40(%rsp), %rax
>         movq    -32(%rsp), %rsi
>         vpinsrq $1, %rcx, %xmm6, %xmm6
>         vmovq   %xmm0, -24(%rsp)
>         vmovq   %rax, %xmm7
>         vmovq   %xmm1, -16(%rsp)
>         vmovapd %xmm6, %xmm2
>         vpinsrq $1, %rsi, %xmm7, %xmm7
>         vfmadd132pd     -24(%rsp), %xmm7, %xmm2
>         vmovapd %xmm2, -56(%rsp)
>         vmovsd  -48(%rsp), %xmm1
>         vmovsd  -56(%rsp), %xmm0
>         ret
>
> with this change, we avoid many of the reloads via memory,
>
> foo:    vpunpcklqdq     %xmm3, %xmm2, %xmm7
>         vpunpcklqdq     %xmm1, %xmm0, %xmm6
>         vpunpcklqdq     %xmm5, %xmm4, %xmm2
>         vmovdqa %xmm7, -24(%rsp)
>         vmovdqa %xmm6, %xmm1
>         movq    -16(%rsp), %rax
>         vpinsrq $1, %rax, %xmm7, %xmm4
>         vmovapd %xmm4, %xmm6
>         vfmadd132pd     %xmm1, %xmm2, %xmm6
>         vmovapd %xmm6, -24(%rsp)
>         vmovsd  -16(%rsp), %xmm1
>         vmovsd  -24(%rsp), %xmm0
>         ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
>
> 2023-07-19  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.cc (ix86_expand_move): Don't call
>         force_reg, to use SUBREG rather than create a new pseudo when
>         inserting DFmode fields into TImode with insvti_{high,low}part.
>         (*concat<mode><dwi>3_3): Split into two define_insn_and_split...
>         (*concatditi3_3): 64-bit implementation.  Provide alternative
>         that allows register allocation to use SSE registers that is
>         split into vec_concatv2di after reload.
>         (*concatsidi3_3): 32-bit implementation.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/pr88873.c: New test case.

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index f9b0dc6..9c3febe 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -558,7 +558,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
   op0 = SUBREG_REG (op0);
   tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
   if (mode == DFmode)
-    op1 = force_reg (DImode, gen_lowpart (DImode, op1));
+    op1 = gen_lowpart (DImode, op1);

Please note that gen_lowpart will ICE when op1 is a SUBREG. This is
the reason that we need to first force a SUBREG to a register and then
perform gen_lowpart, and it is necessary to avoid ICE.

   op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
   op1 = gen_rtx_IOR (TImode, tmp, op1);
  }
@@ -570,7 +570,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
   op0 = SUBREG_REG (op0);
   tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
   if (mode == DFmode)
-    op1 = force_reg (DImode, gen_lowpart (DImode, op1));
+    op1 = gen_lowpart (DImode, op1);

Also here.

   op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
   op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
   op1 = gen_rtx_IOR (TImode, tmp, op1);

Uros.
Roger Sayle July 20, 2023, 7:44 a.m. UTC | #2
Hi Uros,

> From: Uros Bizjak <ubizjak@gmail.com>
> Sent: 20 July 2023 07:50
> 
> On Wed, Jul 19, 2023 at 10:07 PM Roger Sayle <roger@nextmovesoftware.com>
> wrote:
> >
> > This patch is the next piece of a solution to the x86_64 ABI issues in
> > PR 88873.  This splits the *concat<mode><dwi>3_3 define_insn_and_split
> > into two patterns, a TARGET_64BIT *concatditi3_3 and a !TARGET_64BIT
> > *concatsidi3_3.  This allows us to add an additional alternative to
> > the the 64-bit version, enabling the register allocator to perform
> > this operation using SSE registers, which is implemented/split after
> > reload using vec_concatv2di.
> >
> > To demonstrate the improvement, the test case from PR88873:
> >
> > typedef struct { double x, y; } s_t;
> >
> > s_t foo (s_t a, s_t b, s_t c)
> > {
> >   return (s_t){ __builtin_fma(a.x, b.x, c.x), __builtin_fma (a.y, b.y,
> > c.y) }; }
> >
> > when compiled with -O2 -march=cascadelake, currently generates:
> >
> > foo:    vmovq   %xmm2, -56(%rsp)
> >         movq    -56(%rsp), %rax
> >         vmovq   %xmm3, -48(%rsp)
> >         vmovq   %xmm4, -40(%rsp)
> >         movq    -48(%rsp), %rcx
> >         vmovq   %xmm5, -32(%rsp)
> >         vmovq   %rax, %xmm6
> >         movq    -40(%rsp), %rax
> >         movq    -32(%rsp), %rsi
> >         vpinsrq $1, %rcx, %xmm6, %xmm6
> >         vmovq   %xmm0, -24(%rsp)
> >         vmovq   %rax, %xmm7
> >         vmovq   %xmm1, -16(%rsp)
> >         vmovapd %xmm6, %xmm2
> >         vpinsrq $1, %rsi, %xmm7, %xmm7
> >         vfmadd132pd     -24(%rsp), %xmm7, %xmm2
> >         vmovapd %xmm2, -56(%rsp)
> >         vmovsd  -48(%rsp), %xmm1
> >         vmovsd  -56(%rsp), %xmm0
> >         ret
> >
> > with this change, we avoid many of the reloads via memory,
> >
> > foo:    vpunpcklqdq     %xmm3, %xmm2, %xmm7
> >         vpunpcklqdq     %xmm1, %xmm0, %xmm6
> >         vpunpcklqdq     %xmm5, %xmm4, %xmm2
> >         vmovdqa %xmm7, -24(%rsp)
> >         vmovdqa %xmm6, %xmm1
> >         movq    -16(%rsp), %rax
> >         vpinsrq $1, %rax, %xmm7, %xmm4
> >         vmovapd %xmm4, %xmm6
> >         vfmadd132pd     %xmm1, %xmm2, %xmm6
> >         vmovapd %xmm6, -24(%rsp)
> >         vmovsd  -16(%rsp), %xmm1
> >         vmovsd  -24(%rsp), %xmm0
> >         ret
> >
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32}
> > with no new failures.  Ok for mainline?
> >
> >
> > 2023-07-19  Roger Sayle  <roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> >         * config/i386/i386-expand.cc (ix86_expand_move): Don't call
> >         force_reg, to use SUBREG rather than create a new pseudo when
> >         inserting DFmode fields into TImode with insvti_{high,low}part.
> >         (*concat<mode><dwi>3_3): Split into two define_insn_and_split...
> >         (*concatditi3_3): 64-bit implementation.  Provide alternative
> >         that allows register allocation to use SSE registers that is
> >         split into vec_concatv2di after reload.
> >         (*concatsidi3_3): 32-bit implementation.
> >
> > gcc/testsuite/ChangeLog
> >         * gcc.target/i386/pr88873.c: New test case.
> 
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index f9b0dc6..9c3febe 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -558,7 +558,7 @@ ix86_expand_move (machine_mode mode, rtx
> operands[])
>    op0 = SUBREG_REG (op0);
>    tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
>    if (mode == DFmode)
> -    op1 = force_reg (DImode, gen_lowpart (DImode, op1));
> +    op1 = gen_lowpart (DImode, op1);
> 
> Please note that gen_lowpart will ICE when op1 is a SUBREG. This is the reason
> that we need to first force a SUBREG to a register and then perform gen_lowpart,
> and it is necessary to avoid ICE.

The good news is that we know op1 is a register, as this is tested by
"&& REG_P (op1)" on line 551.  You'll also notice that I'm not removing
the force_reg from before the call to gen_lowpart, but removing the call
to force_reg after the call to gen_lowpart.  When I originally wrote this,
the hope was that placing this SUBREG in its own pseudo would help
with register allocation/CSE.  Unfortunately, increasing the number of
pseudos (in this case) increases compile-time (due to quadratic behaviour
in LRA), as shown by PR rtl-optimization/110587, and keeping the DF->DI
conversion in a SUBREG inside the insvti_{high,low}part allows the
register allocator to see the DF->DI->TI sequence in a single pattern,
and hence choose to keep the TI mode in SSE registers, rather than use
a pair of reloads, to write the DF value to memory, then read it back as
a scalar in DImode, and perhaps the same again to go the other way.

>    op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
>    op1 = gen_rtx_IOR (TImode, tmp, op1);
>   }
> @@ -570,7 +570,7 @@ ix86_expand_move (machine_mode mode, rtx
> operands[])
>    op0 = SUBREG_REG (op0);
>    tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
>    if (mode == DFmode)
> -    op1 = force_reg (DImode, gen_lowpart (DImode, op1));
> +    op1 = gen_lowpart (DImode, op1);
> 
> Also here.
Uros Bizjak July 20, 2023, 7:49 a.m. UTC | #3
On Thu, Jul 20, 2023 at 9:44 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> Hi Uros,
>
> > From: Uros Bizjak <ubizjak@gmail.com>
> > Sent: 20 July 2023 07:50
> >
> > On Wed, Jul 19, 2023 at 10:07 PM Roger Sayle <roger@nextmovesoftware.com>
> > wrote:
> > >
> > > This patch is the next piece of a solution to the x86_64 ABI issues in
> > > PR 88873.  This splits the *concat<mode><dwi>3_3 define_insn_and_split
> > > into two patterns, a TARGET_64BIT *concatditi3_3 and a !TARGET_64BIT
> > > *concatsidi3_3.  This allows us to add an additional alternative to
> > > the the 64-bit version, enabling the register allocator to perform
> > > this operation using SSE registers, which is implemented/split after
> > > reload using vec_concatv2di.
> > >
> > > To demonstrate the improvement, the test case from PR88873:
> > >
> > > typedef struct { double x, y; } s_t;
> > >
> > > s_t foo (s_t a, s_t b, s_t c)
> > > {
> > >   return (s_t){ __builtin_fma(a.x, b.x, c.x), __builtin_fma (a.y, b.y,
> > > c.y) }; }
> > >
> > > when compiled with -O2 -march=cascadelake, currently generates:
> > >
> > > foo:    vmovq   %xmm2, -56(%rsp)
> > >         movq    -56(%rsp), %rax
> > >         vmovq   %xmm3, -48(%rsp)
> > >         vmovq   %xmm4, -40(%rsp)
> > >         movq    -48(%rsp), %rcx
> > >         vmovq   %xmm5, -32(%rsp)
> > >         vmovq   %rax, %xmm6
> > >         movq    -40(%rsp), %rax
> > >         movq    -32(%rsp), %rsi
> > >         vpinsrq $1, %rcx, %xmm6, %xmm6
> > >         vmovq   %xmm0, -24(%rsp)
> > >         vmovq   %rax, %xmm7
> > >         vmovq   %xmm1, -16(%rsp)
> > >         vmovapd %xmm6, %xmm2
> > >         vpinsrq $1, %rsi, %xmm7, %xmm7
> > >         vfmadd132pd     -24(%rsp), %xmm7, %xmm2
> > >         vmovapd %xmm2, -56(%rsp)
> > >         vmovsd  -48(%rsp), %xmm1
> > >         vmovsd  -56(%rsp), %xmm0
> > >         ret
> > >
> > > with this change, we avoid many of the reloads via memory,
> > >
> > > foo:    vpunpcklqdq     %xmm3, %xmm2, %xmm7
> > >         vpunpcklqdq     %xmm1, %xmm0, %xmm6
> > >         vpunpcklqdq     %xmm5, %xmm4, %xmm2
> > >         vmovdqa %xmm7, -24(%rsp)
> > >         vmovdqa %xmm6, %xmm1
> > >         movq    -16(%rsp), %rax
> > >         vpinsrq $1, %rax, %xmm7, %xmm4
> > >         vmovapd %xmm4, %xmm6
> > >         vfmadd132pd     %xmm1, %xmm2, %xmm6
> > >         vmovapd %xmm6, -24(%rsp)
> > >         vmovsd  -16(%rsp), %xmm1
> > >         vmovsd  -24(%rsp), %xmm0
> > >         ret
> > >
> > >
> > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > > and make -k check, both with and without --target_board=unix{-m32}
> > > with no new failures.  Ok for mainline?
> > >
> > >
> > > 2023-07-19  Roger Sayle  <roger@nextmovesoftware.com>
> > >
> > > gcc/ChangeLog
> > >         * config/i386/i386-expand.cc (ix86_expand_move): Don't call
> > >         force_reg, to use SUBREG rather than create a new pseudo when
> > >         inserting DFmode fields into TImode with insvti_{high,low}part.
> > >         (*concat<mode><dwi>3_3): Split into two define_insn_and_split...
> > >         (*concatditi3_3): 64-bit implementation.  Provide alternative
> > >         that allows register allocation to use SSE registers that is
> > >         split into vec_concatv2di after reload.
> > >         (*concatsidi3_3): 32-bit implementation.
> > >
> > > gcc/testsuite/ChangeLog
> > >         * gcc.target/i386/pr88873.c: New test case.
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index f9b0dc6..9c3febe 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -558,7 +558,7 @@ ix86_expand_move (machine_mode mode, rtx
> > operands[])
> >    op0 = SUBREG_REG (op0);
> >    tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
> >    if (mode == DFmode)
> > -    op1 = force_reg (DImode, gen_lowpart (DImode, op1));
> > +    op1 = gen_lowpart (DImode, op1);
> >
> > Please note that gen_lowpart will ICE when op1 is a SUBREG. This is the reason
> > that we need to first force a SUBREG to a register and then perform gen_lowpart,
> > and it is necessary to avoid ICE.
>
> The good news is that we know op1 is a register, as this is tested by
> "&& REG_P (op1)" on line 551.  You'll also notice that I'm not removing
> the force_reg from before the call to gen_lowpart, but removing the call
> to force_reg after the call to gen_lowpart.  When I originally wrote this,
> the hope was that placing this SUBREG in its own pseudo would help
> with register allocation/CSE.  Unfortunately, increasing the number of
> pseudos (in this case) increases compile-time (due to quadratic behaviour
> in LRA), as shown by PR rtl-optimization/110587, and keeping the DF->DI
> conversion in a SUBREG inside the insvti_{high,low}part allows the
> register allocator to see the DF->DI->TI sequence in a single pattern,
> and hence choose to keep the TI mode in SSE registers, rather than use
> a pair of reloads, to write the DF value to memory, then read it back as
> a scalar in DImode, and perhaps the same again to go the other way.

This was my only concern with the patch, with that cleared, the patch is OK.

Thanks,
Uros.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index f9b0dc6..9c3febe 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -558,7 +558,7 @@  ix86_expand_move (machine_mode mode, rtx operands[])
 	  op0 = SUBREG_REG (op0);
 	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
 	  if (mode == DFmode)
-	    op1 = force_reg (DImode, gen_lowpart (DImode, op1));
+	    op1 = gen_lowpart (DImode, op1);
 	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
 	  op1 = gen_rtx_IOR (TImode, tmp, op1);
 	}
@@ -570,7 +570,7 @@  ix86_expand_move (machine_mode mode, rtx operands[])
 	  op0 = SUBREG_REG (op0);
 	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
 	  if (mode == DFmode)
-	    op1 = force_reg (DImode, gen_lowpart (DImode, op1));
+	    op1 = gen_lowpart (DImode, op1);
 	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
 	  op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
 	  op1 = gen_rtx_IOR (TImode, tmp, op1);
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 47ea050..8c54aa5 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12408,21 +12408,47 @@ 
   DONE;
 })
 
-(define_insn_and_split "*concat<mode><dwi>3_3"
-  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r")
-	(any_or_plus:<DWI>
-	  (ashift:<DWI>
-	    (zero_extend:<DWI>
-	      (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
+(define_insn_and_split "*concatditi3_3"
+  [(set (match_operand:TI 0 "nonimmediate_operand" "=ro,r,r,&r,x")
+	(any_or_plus:TI
+	  (ashift:TI
+	    (zero_extend:TI
+	      (match_operand:DI 1 "nonimmediate_operand" "r,m,r,m,x"))
 	    (match_operand:QI 2 "const_int_operand"))
-	  (zero_extend:<DWI>
-	    (match_operand:DWIH 3 "nonimmediate_operand" "r,r,m,m"))))]
-  "INTVAL (operands[2]) == <MODE_SIZE> * BITS_PER_UNIT"
+	  (zero_extend:TI
+	    (match_operand:DI 3 "nonimmediate_operand" "r,r,m,m,0"))))]
+  "TARGET_64BIT
+   && INTVAL (operands[2]) == 64"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  if (SSE_REG_P (operands[0]))
+    {
+      rtx tmp = gen_rtx_REG (V2DImode, REGNO (operands[0]));
+      emit_insn (gen_vec_concatv2di (tmp, operands[3], operands[1]));
+    }
+  else
+    split_double_concat (TImode, operands[0], operands[3], operands[1]);
+  DONE;
+})
+
+(define_insn_and_split "*concatsidi3_3"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=ro,r,r,&r")
+	(any_or_plus:DI
+	  (ashift:DI
+	    (zero_extend:DI
+	      (match_operand:SI 1 "nonimmediate_operand" "r,m,r,m"))
+	    (match_operand:QI 2 "const_int_operand"))
+	  (zero_extend:DI
+	    (match_operand:SI 3 "nonimmediate_operand" "r,r,m,m"))))]
+  "!TARGET_64BIT
+   && INTVAL (operands[2]) == 32"
   "#"
   "&& reload_completed"
   [(const_int 0)]
 {
-  split_double_concat (<DWI>mode, operands[0], operands[3], operands[1]);
+  split_double_concat (DImode, operands[0], operands[3], operands[1]);
   DONE;
 })