diff mbox series

Enable more optimization for 32-bit/64-bit shrd/shld with imm shift count.

Message ID 20221031012310.1237451-1-hongtao.liu@intel.com
State New
Headers show
Series Enable more optimization for 32-bit/64-bit shrd/shld with imm shift count. | expand

Commit Message

liuhongt Oct. 31, 2022, 1:23 a.m. UTC
This patch doesn't handle variable count since it requires 5 insns to
be combined to get the wanted pattern, but current pass_combine only
supports at most 4.
This patch doesn't handle 16-bit shrd/shld either.

Ideally, we could avoid the redundancy of
*x86_64_shld_shrd_1_nozext/*x86_shld_shrd_1_nozext
if the middle end could recognize that they're just variants of
*x86_64_shrd_shld_1_nozext/*x86_shrd_shld_1_nozext with ashift/lshiftrt swapped
in the ior, which is commutative. But currently it doesn't, so I add both of
them in the patch.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?


gcc/ChangeLog:

	PR target/55583
	* config/i386/i386.md (*x86_64_shld_1): Rename to ..
	(x86_64_shld_1): .. this.
	(*x86_shld_1): Rename to ..
	(x86_shld_1): .. this.
	(*x86_64_shrd_1): Rename to ..
	(x86_64_shrd_1): .. this.
	(*x86_shrd_1): Rename to ..
	(x86_shrd_1): .. this.
	(*x86_64_shld_shrd_1_nozext): New pre_reload splitter.
	(*x86_shld_shrd_1_nozext): Ditto.
	(*x86_64_shrd_shld_1_nozext): Ditto.
	(*x86_shrd_shld_1_nozext): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr55583.c: New test.
---
 gcc/config/i386/i386.md                 | 150 +++++++++++++++++++++++-
 gcc/testsuite/gcc.target/i386/pr55583.c |  27 +++++
 2 files changed, 173 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr55583.c

Comments

Uros Bizjak Oct. 31, 2022, 9:41 a.m. UTC | #1
On Mon, Oct 31, 2022 at 2:25 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> This patch doesn't handle variable count since it requires 5 insns to
> be combined to get the wanted pattern, but current pass_combine only
> supports at most 4.
> This patch doesn't handle 16-bit shrd/shld either.
>
> Ideally, we can avoid redundancy of
> *x86_64_shld_shrd_1_nozext/*x86_shld_shrd_1_nozext
> if middle end could recognize they're just variants of the
> *x86_64_shrd_shld_1_nozext/*x86_shrd_shld_1_nozext with ashift/lshiftrt swapped
> in the ior which is commutative. But currently it doesn't, so I add both of
> them in the patch.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
>
> gcc/ChangeLog:
>
>         PR target/55583
>         * config/i386/i386.md (*x86_64_shld_1): Rename to ..
>         (x86_64_shld_1): .. this.
>         (*x86_shld_1): Rename to ..
>         (x86_shld_1): .. this.
>         (*x86_64_shrd_1): Rename to ..
>         (x86_64_shrd_1): .. this.
>         (*x86_shrd_1): Rename to ..
>         (x86_shrd_1): .. this.
>         (*x86_64_shld_shrd_1_nozext): New pre_reload splitter.
>         (*x86_shld_shrd_1_nozext): Ditto.
>         (*x86_64_shrd_shld_1_nozext): Ditto.
>         (*x86_shrd_shld_1_nozext): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr55583.c: New test.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.md                 | 150 +++++++++++++++++++++++-
>  gcc/testsuite/gcc.target/i386/pr55583.c |  27 +++++
>  2 files changed, 173 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr55583.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index baf1f1f8fa2..a3ac319f0d7 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -12470,7 +12470,7 @@ (define_insn "x86_64_shld"
>     (set_attr "amdfam10_decode" "vector")
>     (set_attr "bdver1_decode" "vector")])
>
> -(define_insn "*x86_64_shld_1"
> +(define_insn "x86_64_shld_1"
>    [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
>          (ior:DI (ashift:DI (match_dup 0)
>                            (match_operand:QI 2 "const_0_to_63_operand"))
> @@ -12491,6 +12491,42 @@ (define_insn "*x86_64_shld_1"
>     (set_attr "amdfam10_decode" "vector")
>     (set_attr "bdver1_decode" "vector")])
>
> +(define_insn_and_split "*x86_64_shld_shrd_1_nozext"
> +  [(set (match_operand:DI 0 "nonimmediate_operand")
> +       (ior:DI (ashift:DI (match_operand:DI 4 "nonimmediate_operand")
> +                            (match_operand:QI 2 "const_0_to_63_operand"))
> +               (lshiftrt:DI
> +                 (match_operand:DI 1 "nonimmediate_operand")
> +                 (match_operand:QI 3 "const_0_to_63_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_64BIT
> +   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
> +   && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  if (rtx_equal_p (operands[4], operands[0]))
> +    {
> +      operands[1] = force_reg (DImode, operands[1]);
> +      emit_insn (gen_x86_64_shld_1 (operands[0], operands[1], operands[2], operands[3]));
> +    }
> +  else if (rtx_equal_p (operands[1], operands[0]))
> +    {
> +      operands[4] = force_reg (DImode, operands[4]);
> +      emit_insn (gen_x86_64_shrd_1 (operands[0], operands[4], operands[3], operands[2]));
> +    }
> +  else
> +   {
> +     operands[1] = force_reg (DImode, operands[1]);
> +     rtx tmp = gen_reg_rtx (DImode);
> +     emit_move_insn (tmp, operands[4]);
> +     emit_insn (gen_x86_64_shld_1 (tmp, operands[1], operands[2], operands[3]));
> +     emit_move_insn (operands[0], tmp);
> +   }
> +   DONE;
> +})
> +
>  (define_insn_and_split "*x86_64_shld_2"
>    [(set (match_operand:DI 0 "nonimmediate_operand")
>         (ior:DI (ashift:DI (match_dup 0)
> @@ -12534,7 +12570,7 @@ (define_insn "x86_shld"
>     (set_attr "amdfam10_decode" "vector")
>     (set_attr "bdver1_decode" "vector")])
>
> -(define_insn "*x86_shld_1"
> +(define_insn "x86_shld_1"
>    [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m")
>          (ior:SI (ashift:SI (match_dup 0)
>                            (match_operand:QI 2 "const_0_to_31_operand"))
> @@ -12555,6 +12591,41 @@ (define_insn "*x86_shld_1"
>     (set_attr "amdfam10_decode" "vector")
>     (set_attr "bdver1_decode" "vector")])
>
> +(define_insn_and_split "*x86_shld_shrd_1_nozext"
> +  [(set (match_operand:SI 0 "nonimmediate_operand")
> +       (ior:SI (ashift:SI (match_operand:SI 4 "nonimmediate_operand")
> +                            (match_operand:QI 2 "const_0_to_31_operand"))
> +              (lshiftrt:SI
> +                  (match_operand:SI 1 "nonimmediate_operand")
> +                  (match_operand:QI 3 "const_0_to_31_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "INTVAL (operands[3]) == 32 - INTVAL (operands[2])
> +   && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  if (rtx_equal_p (operands[4], operands[0]))
> +    {
> +      operands[1] = force_reg (SImode, operands[1]);
> +      emit_insn (gen_x86_shld_1 (operands[0], operands[1], operands[2], operands[3]));
> +    }
> +  else if (rtx_equal_p (operands[1], operands[0]))
> +    {
> +      operands[4] = force_reg (SImode, operands[4]);
> +      emit_insn (gen_x86_shrd_1 (operands[0], operands[4], operands[3], operands[2]));
> +    }
> +  else
> +   {
> +     operands[1] = force_reg (SImode, operands[1]);
> +     rtx tmp = gen_reg_rtx (SImode);
> +     emit_move_insn (tmp, operands[4]);
> +     emit_insn (gen_x86_shld_1 (tmp, operands[1], operands[2], operands[3]));
> +     emit_move_insn (operands[0], tmp);
> +   }
> +   DONE;
> +})
> +
>  (define_insn_and_split "*x86_shld_2"
>    [(set (match_operand:SI 0 "nonimmediate_operand")
>         (ior:SI (ashift:SI (match_dup 0)
> @@ -13433,7 +13504,7 @@ (define_insn "x86_64_shrd"
>     (set_attr "amdfam10_decode" "vector")
>     (set_attr "bdver1_decode" "vector")])
>
> -(define_insn "*x86_64_shrd_1"
> +(define_insn "x86_64_shrd_1"
>    [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
>          (ior:DI (lshiftrt:DI (match_dup 0)
>                              (match_operand:QI 2 "const_0_to_63_operand"))
> @@ -13454,6 +13525,42 @@ (define_insn "*x86_64_shrd_1"
>     (set_attr "amdfam10_decode" "vector")
>     (set_attr "bdver1_decode" "vector")])
>
> +(define_insn_and_split "*x86_64_shrd_shld_1_nozext"
> +  [(set (match_operand:DI 0 "nonimmediate_operand")
> +       (ior:DI (lshiftrt:DI (match_operand:DI 4 "nonimmediate_operand")
> +                            (match_operand:QI 2 "const_0_to_63_operand"))
> +               (ashift:DI
> +                 (match_operand:DI 1 "nonimmediate_operand")
> +                 (match_operand:QI 3 "const_0_to_63_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_64BIT
> +   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
> +   && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  if (rtx_equal_p (operands[4], operands[0]))
> +    {
> +      operands[1] = force_reg (DImode, operands[1]);
> +      emit_insn (gen_x86_64_shrd_1 (operands[0], operands[1], operands[2], operands[3]));
> +    }
> +  else if (rtx_equal_p (operands[1], operands[0]))
> +    {
> +      operands[4] = force_reg (DImode, operands[4]);
> +      emit_insn (gen_x86_64_shld_1 (operands[0], operands[4], operands[3], operands[2]));
> +    }
> +  else
> +   {
> +     operands[1] = force_reg (DImode, operands[1]);
> +     rtx tmp = gen_reg_rtx (DImode);
> +     emit_move_insn (tmp, operands[4]);
> +     emit_insn (gen_x86_64_shrd_1 (tmp, operands[1], operands[2], operands[3]));
> +     emit_move_insn (operands[0], tmp);
> +   }
> +   DONE;
> +})
> +
>  (define_insn_and_split "*x86_64_shrd_2"
>    [(set (match_operand:DI 0 "nonimmediate_operand")
>         (ior:DI (lshiftrt:DI (match_dup 0)
> @@ -13497,7 +13604,7 @@ (define_insn "x86_shrd"
>     (set_attr "amdfam10_decode" "vector")
>     (set_attr "bdver1_decode" "vector")])
>
> -(define_insn "*x86_shrd_1"
> +(define_insn "x86_shrd_1"
>    [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m")
>          (ior:SI (lshiftrt:SI (match_dup 0)
>                              (match_operand:QI 2 "const_0_to_31_operand"))
> @@ -13518,6 +13625,41 @@ (define_insn "*x86_shrd_1"
>     (set_attr "amdfam10_decode" "vector")
>     (set_attr "bdver1_decode" "vector")])
>
> +(define_insn_and_split "*x86_shrd_shld_1_nozext"
> +  [(set (match_operand:SI 0 "nonimmediate_operand")
> +       (ior:SI (lshiftrt:SI (match_operand:SI 4 "nonimmediate_operand")
> +                            (match_operand:QI 2 "const_0_to_31_operand"))
> +              (ashift:SI
> +                  (match_operand:SI 1 "nonimmediate_operand")
> +                  (match_operand:QI 3 "const_0_to_31_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "INTVAL (operands[3]) == 32 - INTVAL (operands[2])
> +   && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  if (rtx_equal_p (operands[4], operands[0]))
> +    {
> +      operands[1] = force_reg (SImode, operands[1]);
> +      emit_insn (gen_x86_shrd_1 (operands[0], operands[1], operands[2], operands[3]));
> +    }
> +  else if (rtx_equal_p (operands[1], operands[0]))
> +    {
> +      operands[4] = force_reg (SImode, operands[4]);
> +      emit_insn (gen_x86_shld_1 (operands[0], operands[4], operands[3], operands[2]));
> +    }
> +  else
> +   {
> +     operands[1] = force_reg (SImode, operands[1]);
> +     rtx tmp = gen_reg_rtx (SImode);
> +     emit_move_insn (tmp, operands[4]);
> +     emit_insn (gen_x86_shrd_1 (tmp, operands[1], operands[2], operands[3]));
> +     emit_move_insn (operands[0], tmp);
> +   }
> +   DONE;
> +})
> +
>  (define_insn_and_split "*x86_shrd_2"
>    [(set (match_operand:SI 0 "nonimmediate_operand")
>         (ior:SI (lshiftrt:SI (match_dup 0)
> diff --git a/gcc/testsuite/gcc.target/i386/pr55583.c b/gcc/testsuite/gcc.target/i386/pr55583.c
> new file mode 100644
> index 00000000000..1c128b5d929
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr55583.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile  } */
> +/* { dg-options "-O2 -Wno-shift-count-overflow" } */
> +/* { dg-final { scan-assembler-times {(?n)shrd[ql]?[\t ]*\$2} 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times {(?n)shrdl?[\t ]*\$2} 2 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times {(?n)shldl?[\t ]*\$2} 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times {(?n)shld[ql]?[\t ]*\$2} 2 { target { ! ia32 } } } } */
> +
> +typedef unsigned long  u64;
> +typedef unsigned int   u32;
> +typedef unsigned short u16;
> +
> +long  a, b;
> +int   c, d;
> +short e, f;
> +const int n = 2;
> +
> +void test64r () { b = ((u64)b >> n) | (a << (64 - n)); }
> +void test32r () { d = ((u32)d >> n) | (c << (32 - n)); }
> +
> +unsigned long  ua, ub;
> +unsigned int   uc, ud;
> +unsigned short ue, uf;
> +
> +void testu64l () { ub = (ub << n) | (ua >> (64 - n)); }
> +void testu64r () { ub = (ub >> n) | (ua << (64 - n)); }
> +void testu32l () { ud = (ud << n) | (uc >> (32 - n)); }
> +void testu32r () { ud = (ud >> n) | (uc << (32 - n)); }
> --
> 2.27.0
>
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index baf1f1f8fa2..a3ac319f0d7 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12470,7 +12470,7 @@  (define_insn "x86_64_shld"
    (set_attr "amdfam10_decode" "vector")
    (set_attr "bdver1_decode" "vector")])
 
-(define_insn "*x86_64_shld_1"
+(define_insn "x86_64_shld_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
         (ior:DI (ashift:DI (match_dup 0)
 			   (match_operand:QI 2 "const_0_to_63_operand"))
@@ -12491,6 +12491,42 @@  (define_insn "*x86_64_shld_1"
    (set_attr "amdfam10_decode" "vector")
    (set_attr "bdver1_decode" "vector")])
 
+(define_insn_and_split "*x86_64_shld_shrd_1_nozext"
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+	(ior:DI (ashift:DI (match_operand:DI 4 "nonimmediate_operand")
+			     (match_operand:QI 2 "const_0_to_63_operand"))
+		(lshiftrt:DI
+		  (match_operand:DI 1 "nonimmediate_operand")
+		  (match_operand:QI 3 "const_0_to_63_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (rtx_equal_p (operands[4], operands[0]))
+    {
+      operands[1] = force_reg (DImode, operands[1]);
+      emit_insn (gen_x86_64_shld_1 (operands[0], operands[1], operands[2], operands[3]));
+    }
+  else if (rtx_equal_p (operands[1], operands[0]))
+    {
+      operands[4] = force_reg (DImode, operands[4]);
+      emit_insn (gen_x86_64_shrd_1 (operands[0], operands[4], operands[3], operands[2]));
+    }
+  else
+   {
+     operands[1] = force_reg (DImode, operands[1]);
+     rtx tmp = gen_reg_rtx (DImode);
+     emit_move_insn (tmp, operands[4]);
+     emit_insn (gen_x86_64_shld_1 (tmp, operands[1], operands[2], operands[3]));
+     emit_move_insn (operands[0], tmp);
+   }
+   DONE;
+})
+
 (define_insn_and_split "*x86_64_shld_2"
   [(set (match_operand:DI 0 "nonimmediate_operand")
 	(ior:DI (ashift:DI (match_dup 0)
@@ -12534,7 +12570,7 @@  (define_insn "x86_shld"
    (set_attr "amdfam10_decode" "vector")
    (set_attr "bdver1_decode" "vector")])
 
-(define_insn "*x86_shld_1"
+(define_insn "x86_shld_1"
   [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m")
         (ior:SI (ashift:SI (match_dup 0)
 			   (match_operand:QI 2 "const_0_to_31_operand"))
@@ -12555,6 +12591,41 @@  (define_insn "*x86_shld_1"
    (set_attr "amdfam10_decode" "vector")
    (set_attr "bdver1_decode" "vector")])
 
+(define_insn_and_split "*x86_shld_shrd_1_nozext"
+  [(set (match_operand:SI 0 "nonimmediate_operand")
+	(ior:SI (ashift:SI (match_operand:SI 4 "nonimmediate_operand")
+			     (match_operand:QI 2 "const_0_to_31_operand"))
+	       (lshiftrt:SI
+		   (match_operand:SI 1 "nonimmediate_operand")
+		   (match_operand:QI 3 "const_0_to_31_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "INTVAL (operands[3]) == 32 - INTVAL (operands[2])
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (rtx_equal_p (operands[4], operands[0]))
+    {
+      operands[1] = force_reg (SImode, operands[1]);
+      emit_insn (gen_x86_shld_1 (operands[0], operands[1], operands[2], operands[3]));
+    }
+  else if (rtx_equal_p (operands[1], operands[0]))
+    {
+      operands[4] = force_reg (SImode, operands[4]);
+      emit_insn (gen_x86_shrd_1 (operands[0], operands[4], operands[3], operands[2]));
+    }
+  else
+   {
+     operands[1] = force_reg (SImode, operands[1]);
+     rtx tmp = gen_reg_rtx (SImode);
+     emit_move_insn (tmp, operands[4]);
+     emit_insn (gen_x86_shld_1 (tmp, operands[1], operands[2], operands[3]));
+     emit_move_insn (operands[0], tmp);
+   }
+   DONE;
+})
+
 (define_insn_and_split "*x86_shld_2"
   [(set (match_operand:SI 0 "nonimmediate_operand")
 	(ior:SI (ashift:SI (match_dup 0)
@@ -13433,7 +13504,7 @@  (define_insn "x86_64_shrd"
    (set_attr "amdfam10_decode" "vector")
    (set_attr "bdver1_decode" "vector")])
 
-(define_insn "*x86_64_shrd_1"
+(define_insn "x86_64_shrd_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
         (ior:DI (lshiftrt:DI (match_dup 0)
 			     (match_operand:QI 2 "const_0_to_63_operand"))
@@ -13454,6 +13525,42 @@  (define_insn "*x86_64_shrd_1"
    (set_attr "amdfam10_decode" "vector")
    (set_attr "bdver1_decode" "vector")])
 
+(define_insn_and_split "*x86_64_shrd_shld_1_nozext"
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+	(ior:DI (lshiftrt:DI (match_operand:DI 4 "nonimmediate_operand")
+			     (match_operand:QI 2 "const_0_to_63_operand"))
+		(ashift:DI
+		  (match_operand:DI 1 "nonimmediate_operand")
+		  (match_operand:QI 3 "const_0_to_63_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (rtx_equal_p (operands[4], operands[0]))
+    {
+      operands[1] = force_reg (DImode, operands[1]);
+      emit_insn (gen_x86_64_shrd_1 (operands[0], operands[1], operands[2], operands[3]));
+    }
+  else if (rtx_equal_p (operands[1], operands[0]))
+    {
+      operands[4] = force_reg (DImode, operands[4]);
+      emit_insn (gen_x86_64_shld_1 (operands[0], operands[4], operands[3], operands[2]));
+    }
+  else
+   {
+     operands[1] = force_reg (DImode, operands[1]);
+     rtx tmp = gen_reg_rtx (DImode);
+     emit_move_insn (tmp, operands[4]);
+     emit_insn (gen_x86_64_shrd_1 (tmp, operands[1], operands[2], operands[3]));
+     emit_move_insn (operands[0], tmp);
+   }
+   DONE;
+})
+
 (define_insn_and_split "*x86_64_shrd_2"
   [(set (match_operand:DI 0 "nonimmediate_operand")
 	(ior:DI (lshiftrt:DI (match_dup 0)
@@ -13497,7 +13604,7 @@  (define_insn "x86_shrd"
    (set_attr "amdfam10_decode" "vector")
    (set_attr "bdver1_decode" "vector")])
 
-(define_insn "*x86_shrd_1"
+(define_insn "x86_shrd_1"
   [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m")
         (ior:SI (lshiftrt:SI (match_dup 0)
 			     (match_operand:QI 2 "const_0_to_31_operand"))
@@ -13518,6 +13625,41 @@  (define_insn "*x86_shrd_1"
    (set_attr "amdfam10_decode" "vector")
    (set_attr "bdver1_decode" "vector")])
 
+(define_insn_and_split "*x86_shrd_shld_1_nozext"
+  [(set (match_operand:SI 0 "nonimmediate_operand")
+	(ior:SI (lshiftrt:SI (match_operand:SI 4 "nonimmediate_operand")
+			     (match_operand:QI 2 "const_0_to_31_operand"))
+	       (ashift:SI
+		   (match_operand:SI 1 "nonimmediate_operand")
+		   (match_operand:QI 3 "const_0_to_31_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "INTVAL (operands[3]) == 32 - INTVAL (operands[2])
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (rtx_equal_p (operands[4], operands[0]))
+    {
+      operands[1] = force_reg (SImode, operands[1]);
+      emit_insn (gen_x86_shrd_1 (operands[0], operands[1], operands[2], operands[3]));
+    }
+  else if (rtx_equal_p (operands[1], operands[0]))
+    {
+      operands[4] = force_reg (SImode, operands[4]);
+      emit_insn (gen_x86_shld_1 (operands[0], operands[4], operands[3], operands[2]));
+    }
+  else
+   {
+     operands[1] = force_reg (SImode, operands[1]);
+     rtx tmp = gen_reg_rtx (SImode);
+     emit_move_insn (tmp, operands[4]);
+     emit_insn (gen_x86_shrd_1 (tmp, operands[1], operands[2], operands[3]));
+     emit_move_insn (operands[0], tmp);
+   }
+   DONE;
+})
+
 (define_insn_and_split "*x86_shrd_2"
   [(set (match_operand:SI 0 "nonimmediate_operand")
 	(ior:SI (lshiftrt:SI (match_dup 0)
diff --git a/gcc/testsuite/gcc.target/i386/pr55583.c b/gcc/testsuite/gcc.target/i386/pr55583.c
new file mode 100644
index 00000000000..1c128b5d929
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr55583.c
@@ -0,0 +1,27 @@ 
+/* { dg-do compile  } */
+/* { dg-options "-O2 -Wno-shift-count-overflow" } */
+/* { dg-final { scan-assembler-times {(?n)shrd[ql]?[\t ]*\$2} 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times {(?n)shrdl?[\t ]*\$2} 2 { target ia32 } } } */
+/* { dg-final { scan-assembler-times {(?n)shldl?[\t ]*\$2} 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times {(?n)shld[ql]?[\t ]*\$2} 2 { target { ! ia32 } } } } */
+
+typedef unsigned long  u64;
+typedef unsigned int   u32;
+typedef unsigned short u16;
+
+long  a, b;
+int   c, d;
+short e, f;
+const int n = 2;
+
+void test64r () { b = ((u64)b >> n) | (a << (64 - n)); }
+void test32r () { d = ((u32)d >> n) | (c << (32 - n)); }
+
+unsigned long  ua, ub;
+unsigned int   uc, ud;
+unsigned short ue, uf;
+
+void testu64l () { ub = (ub << n) | (ua >> (64 - n)); }
+void testu64r () { ub = (ub >> n) | (ua << (64 - n)); }
+void testu32l () { ud = (ud << n) | (uc >> (32 - n)); }
+void testu32r () { ud = (ud >> n) | (uc << (32 - n)); }