diff mbox series

[x86] PR target/110511: Fix reg allocation for widening multiplications.

Message ID 022301da012c$f1f00a00$d5d01e00$@nextmovesoftware.com
State New
Headers show
Series [x86] PR target/110511: Fix reg allocation for widening multiplications. | expand

Commit Message

Roger Sayle Oct. 17, 2023, 7:05 p.m. UTC
This patch contains clean-ups of the widening multiplication patterns in
i386.md, and provides variants of the existing highpart multiplication
peephole2 transformations (that tidy up register allocation after
reload), and thereby fixes PR target/110511, which is a superfluous
move instruction.

For the new test case, compiled on x86_64 with -O2.

Before:
mulx64: movabsq $-7046029254386353131, %rcx
        movq    %rcx, %rax
        mulq    %rdi
        xorq    %rdx, %rax
        ret

After:
mulx64: movabsq $-7046029254386353131, %rax
        mulq    %rdi
        xorq    %rdx, %rax
        ret

The clean-ups are (i) that operand 1 is consistently made register_operand
and operand 2 becomes nonimmediate_operand, so that predicates match the
constraints, (ii) the representation of the BMI2 mulx instruction is
updated to use the new umul_highpart RTX, and (iii) because operands
0 and 1 have different modes in widening multiplications, "a" is a more
appropriate constraint than "0" (which avoids spills/reloads containing
SUBREGs).  The new peephole2 transformations are based upon those at
around line 9951 of i386.md, that begins with the comment
;; Highpart multiplication peephole2s to tweak register allocation.
;; mov imm,%rdx; mov %rdi,%rax; imulq %rdx  ->  mov imm,%rax; imulq %rdi


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2023-10-17  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        PR target/110511
        * config/i386/i386.md (<u>mul<mode><dwi>3): Make operands 1 and
        2 take "regiser_operand" and "nonimmediate_operand" respectively.
        (<u>mulqihi3): Likewise.
        (*bmi2_umul<mode><dwi>3_1): Operand 2 needs to be register_operand
        matching the %d constraint.  Use umul_highpart RTX to represent
        the highpart multiplication.
        (*umul<mode><dwi>3_1):  Operand 2 should use regiser_operand
        predicate, and "a" rather than "0" as operands 0 and 2 have
        different modes.
        (define_split): For mul to mulx conversion, use the new
        umul_highpart RTX representation.
        (*mul<mode><dwi>3_1):  Operand 1 should be register_operand
        and the constraint %a as operands 0 and 1 have different modes.
        (*<u>mulqihi3_1): Operand 1 should be register_operand matching
        the constraint %0.
        (define_peephole2): Providing widening multiplication variants
        of the peephole2s that tweak highpart multiplication register
        allocation.

gcc/testsuite/ChangeLog
        PR target/110511
        * gcc.target/i386/pr110511.c: New test case.


Thanks in advance,
Roger

Comments

Uros Bizjak Oct. 19, 2023, 5:01 p.m. UTC | #1
On Tue, Oct 17, 2023 at 9:05 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch contains clean-ups of the widening multiplication patterns in
> i386.md, and provides variants of the existing highpart multiplication
> peephole2 transformations (that tidy up register allocation after
> reload), and thereby fixes PR target/110511, which is a superfluous
> move instruction.
>
> For the new test case, compiled on x86_64 with -O2.
>
> Before:
> mulx64: movabsq $-7046029254386353131, %rcx
>         movq    %rcx, %rax
>         mulq    %rdi
>         xorq    %rdx, %rax
>         ret
>
> After:
> mulx64: movabsq $-7046029254386353131, %rax
>         mulq    %rdi
>         xorq    %rdx, %rax
>         ret
>
> The clean-ups are (i) that operand 1 is consistently made register_operand
> and operand 2 becomes nonimmediate_operand, so that predicates match the
> constraints, (ii) the representation of the BMI2 mulx instruction is
> updated to use the new umul_highpart RTX, and (iii) because operands
> 0 and 1 have different modes in widening multiplications, "a" is a more
> appropriate constraint than "0" (which avoids spills/reloads containing
> SUBREGs).  The new peephole2 transformations are based upon those at
> around line 9951 of i386.md, that begins with the comment
> ;; Highpart multiplication peephole2s to tweak register allocation.
> ;; mov imm,%rdx; mov %rdi,%rax; imulq %rdx  ->  mov imm,%rax; imulq %rdi
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
>
> 2023-10-17  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         PR target/110511
>         * config/i386/i386.md (<u>mul<mode><dwi>3): Make operands 1 and
>         2 take "regiser_operand" and "nonimmediate_operand" respectively.
>         (<u>mulqihi3): Likewise.
>         (*bmi2_umul<mode><dwi>3_1): Operand 2 needs to be register_operand
>         matching the %d constraint.  Use umul_highpart RTX to represent
>         the highpart multiplication.
>         (*umul<mode><dwi>3_1):  Operand 2 should use regiser_operand
>         predicate, and "a" rather than "0" as operands 0 and 2 have
>         different modes.
>         (define_split): For mul to mulx conversion, use the new
>         umul_highpart RTX representation.
>         (*mul<mode><dwi>3_1):  Operand 1 should be register_operand
>         and the constraint %a as operands 0 and 1 have different modes.
>         (*<u>mulqihi3_1): Operand 1 should be register_operand matching
>         the constraint %0.
>         (define_peephole2): Providing widening multiplication variants
>         of the peephole2s that tweak highpart multiplication register
>         allocation.
>
> gcc/testsuite/ChangeLog
>         PR target/110511
>         * gcc.target/i386/pr110511.c: New test case.
>

 (define_insn "*bmi2_umul<mode><dwi>3_1"
   [(set (match_operand:DWIH 0 "register_operand" "=r")
     (mult:DWIH
-      (match_operand:DWIH 2 "nonimmediate_operand" "%d")
+      (match_operand:DWIH 2 "register_operand" "%d")
       (match_operand:DWIH 3 "nonimmediate_operand" "rm")))
    (set (match_operand:DWIH 1 "register_operand" "=r")

This will fail. Because of %, both predicates must allow
nonimmediate_operand, since RA can swap operand constraints due to %.

@@ -9747,7 +9743,7 @@
   [(set (match_operand:<DWI> 0 "register_operand" "=r,A")
     (mult:<DWI>
       (zero_extend:<DWI>
-        (match_operand:DWIH 1 "nonimmediate_operand" "%d,0"))
+        (match_operand:DWIH 1 "register_operand" "%d,a"))
       (zero_extend:<DWI>
         (match_operand:DWIH 2 "nonimmediate_operand" "rm,rm"))))
    (clobber (reg:CC FLAGS_REG))]

The same here, although changing "0" to "a" is correct, but the
oversight is benign. "A" reads as "ad", and the first alternative
already takes "d".

+;; Widening multiplication peephole2s to tweak register allocation.
+;; mov imm,%rdx; mov %rdi,%rax; mulq %rdx  ->  mov imm,%rax; mulq %rdi

Maybe instead of peephole2s, we allow general_operands in the
instruction (both inputs) and rely on the RA to fulfill the
constraint? Would this work?

Uros.
Roger Sayle Oct. 25, 2023, 2:41 p.m. UTC | #2
Hi Uros,

I've tried your suggestions to see what would happen.
Alas, allowing both operands to (i386's) widening multiplications
to be  nonimmediate_operand results in 90 additional testsuite
unexpected failures", and 41 unresolved testcase, around things
like:

gcc.c-torture/compile/di.c:6:1: error: unrecognizable insn:
(insn 14 13 15 2 (parallel [
            (set (reg:DI 98 [ _3 ])
                (mult:DI (zero_extend:DI (mem/c:SI (plus:SI (reg/f:SI 93 virtual-stack-vars)
                                (const_int -8 [0xfffffffffffffff8])) [1 a+0 S4 A64]))
                    (zero_extend:DI (mem/c:SI (plus:SI (reg/f:SI 93 virtual-stack-vars)
                                (const_int -16 [0xfffffffffffffff0])) [1 b+0 S4 A64]))))
            (clobber (reg:CC 17 flags))
        ]) "gcc.c-torture/compile/di.c":5:12 -1
     (nil))
during RTL pass: vregs
gcc.c-torture/compile/di.c:6:1: internal compiler error: in extract_insn, at recog.cc:2791

In my experiments, I've used nonimmediate_operand instead of general_operand,
as a zero_extend of an immediate_operand, like const_int, would be non-canonical.

In short, it's ok (common) for '%' to apply to operands with different predicates;
reload will only swap things if the operand's predicates/constraints remain consistent.
For example, see i386.c's *add<mode>_1 pattern.  And as shown above it can't
be left to (until) reload to decide which "mem" gets loaded into a register (which
would be nice), as some passes before reload check both predicates and constraints.

My original patch fixes PR 110511, using the same peephole2 idiom as already
used elsewhere in i386.md.  Ok for mainline?

> -----Original Message-----
> From: Uros Bizjak <ubizjak@gmail.com>
> Sent: 19 October 2023 18:02
> To: Roger Sayle <roger@nextmovesoftware.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [x86 PATCH] PR target/110511: Fix reg allocation for widening
> multiplications.
> 
> On Tue, Oct 17, 2023 at 9:05 PM Roger Sayle <roger@nextmovesoftware.com>
> wrote:
> >
> >
> > This patch contains clean-ups of the widening multiplication patterns
> > in i386.md, and provides variants of the existing highpart
> > multiplication
> > peephole2 transformations (that tidy up register allocation after
> > reload), and thereby fixes PR target/110511, which is a superfluous
> > move instruction.
> >
> > For the new test case, compiled on x86_64 with -O2.
> >
> > Before:
> > mulx64: movabsq $-7046029254386353131, %rcx
> >         movq    %rcx, %rax
> >         mulq    %rdi
> >         xorq    %rdx, %rax
> >         ret
> >
> > After:
> > mulx64: movabsq $-7046029254386353131, %rax
> >         mulq    %rdi
> >         xorq    %rdx, %rax
> >         ret
> >
> > The clean-ups are (i) that operand 1 is consistently made
> > register_operand and operand 2 becomes nonimmediate_operand, so that
> > predicates match the constraints, (ii) the representation of the BMI2
> > mulx instruction is updated to use the new umul_highpart RTX, and
> > (iii) because operands
> > 0 and 1 have different modes in widening multiplications, "a" is a
> > more appropriate constraint than "0" (which avoids spills/reloads
> > containing SUBREGs).  The new peephole2 transformations are based upon
> > those at around line 9951 of i386.md, that begins with the comment ;;
> > Highpart multiplication peephole2s to tweak register allocation.
> > ;; mov imm,%rdx; mov %rdi,%rax; imulq %rdx  ->  mov imm,%rax; imulq
> > %rdi
> >
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32}
> > with no new failures.  Ok for mainline?
> >
> >
> > 2023-10-17  Roger Sayle  <roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> >         PR target/110511
> >         * config/i386/i386.md (<u>mul<mode><dwi>3): Make operands 1 and
> >         2 take "regiser_operand" and "nonimmediate_operand" respectively.
> >         (<u>mulqihi3): Likewise.
> >         (*bmi2_umul<mode><dwi>3_1): Operand 2 needs to be register_operand
> >         matching the %d constraint.  Use umul_highpart RTX to represent
> >         the highpart multiplication.
> >         (*umul<mode><dwi>3_1):  Operand 2 should use regiser_operand
> >         predicate, and "a" rather than "0" as operands 0 and 2 have
> >         different modes.
> >         (define_split): For mul to mulx conversion, use the new
> >         umul_highpart RTX representation.
> >         (*mul<mode><dwi>3_1):  Operand 1 should be register_operand
> >         and the constraint %a as operands 0 and 1 have different modes.
> >         (*<u>mulqihi3_1): Operand 1 should be register_operand matching
> >         the constraint %0.
> >         (define_peephole2): Providing widening multiplication variants
> >         of the peephole2s that tweak highpart multiplication register
> >         allocation.
> >
> > gcc/testsuite/ChangeLog
> >         PR target/110511
> >         * gcc.target/i386/pr110511.c: New test case.
> >
> 
>  (define_insn "*bmi2_umul<mode><dwi>3_1"
>    [(set (match_operand:DWIH 0 "register_operand" "=r")
>      (mult:DWIH
> -      (match_operand:DWIH 2 "nonimmediate_operand" "%d")
> +      (match_operand:DWIH 2 "register_operand" "%d")
>        (match_operand:DWIH 3 "nonimmediate_operand" "rm")))
>     (set (match_operand:DWIH 1 "register_operand" "=r")
> 
> This will fail. Because of %, both predicates must allow nonimmediate_operand,
> since RA can swap operand constraints due to %.
> 
> @@ -9747,7 +9743,7 @@
>    [(set (match_operand:<DWI> 0 "register_operand" "=r,A")
>      (mult:<DWI>
>        (zero_extend:<DWI>
> -        (match_operand:DWIH 1 "nonimmediate_operand" "%d,0"))
> +        (match_operand:DWIH 1 "register_operand" "%d,a"))
>        (zero_extend:<DWI>
>          (match_operand:DWIH 2 "nonimmediate_operand" "rm,rm"))))
>     (clobber (reg:CC FLAGS_REG))]
> 
> The same here, although changing "0" to "a" is correct, but the oversight is
> benign. "A" reads as "ad", and the first alternative already takes "d".
> 
> +;; Widening multiplication peephole2s to tweak register allocation.
> +;; mov imm,%rdx; mov %rdi,%rax; mulq %rdx  ->  mov imm,%rax; mulq %rdi
> 
> Maybe instead of peephole2s, we allow general_operands in the instruction (both
> inputs) and rely on the RA to fulfill the constraint? Would this work?
> 
> Uros.
Uros Bizjak Oct. 26, 2023, 9:22 a.m. UTC | #3
On Wed, Oct 25, 2023 at 4:41 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
> Hi Uros,
>
> I've tried your suggestions to see what would happen.
> Alas, allowing both operands to (i386's) widening multiplications
> to be  nonimmediate_operand results in 90 additional testsuite
> unexpected failures", and 41 unresolved testcase, around things
> like:
>
> gcc.c-torture/compile/di.c:6:1: error: unrecognizable insn:
> (insn 14 13 15 2 (parallel [
>             (set (reg:DI 98 [ _3 ])
>                 (mult:DI (zero_extend:DI (mem/c:SI (plus:SI (reg/f:SI 93 virtual-stack-vars)
>                                 (const_int -8 [0xfffffffffffffff8])) [1 a+0 S4 A64]))
>                     (zero_extend:DI (mem/c:SI (plus:SI (reg/f:SI 93 virtual-stack-vars)
>                                 (const_int -16 [0xfffffffffffffff0])) [1 b+0 S4 A64]))))
>             (clobber (reg:CC 17 flags))
>         ]) "gcc.c-torture/compile/di.c":5:12 -1
>      (nil))
> during RTL pass: vregs
> gcc.c-torture/compile/di.c:6:1: internal compiler error: in extract_insn, at recog.cc:2791
>
> In my experiments, I've used nonimmediate_operand instead of general_operand,
> as a zero_extend of an immediate_operand, like const_int, would be non-canonical.
>
> In short, it's ok (common) for '%' to apply to operands with different predicates;
> reload will only swap things if the operand's predicates/constraints remain consistent.
> For example, see i386.c's *add<mode>_1 pattern.  And as shown above it can't
> be left to (until) reload to decide which "mem" gets loaded into a register (which
> would be nice), as some passes before reload check both predicates and constraints.
>
> My original patch fixes PR 110511, using the same peephole2 idiom as already
> used elsewhere in i386.md.  Ok for mainline?

Thanks for the explanation. The patch is OK.

> > -----Original Message-----
> > From: Uros Bizjak <ubizjak@gmail.com>
> > Sent: 19 October 2023 18:02
> > To: Roger Sayle <roger@nextmovesoftware.com>
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: Re: [x86 PATCH] PR target/110511: Fix reg allocation for widening
> > multiplications.
> >
> > On Tue, Oct 17, 2023 at 9:05 PM Roger Sayle <roger@nextmovesoftware.com>
> > wrote:
> > >
> > >
> > > This patch contains clean-ups of the widening multiplication patterns
> > > in i386.md, and provides variants of the existing highpart
> > > multiplication
> > > peephole2 transformations (that tidy up register allocation after
> > > reload), and thereby fixes PR target/110511, which is a superfluous
> > > move instruction.
> > >
> > > For the new test case, compiled on x86_64 with -O2.
> > >
> > > Before:
> > > mulx64: movabsq $-7046029254386353131, %rcx
> > >         movq    %rcx, %rax
> > >         mulq    %rdi
> > >         xorq    %rdx, %rax
> > >         ret
> > >
> > > After:
> > > mulx64: movabsq $-7046029254386353131, %rax
> > >         mulq    %rdi
> > >         xorq    %rdx, %rax
> > >         ret
> > >
> > > The clean-ups are (i) that operand 1 is consistently made
> > > register_operand and operand 2 becomes nonimmediate_operand, so that
> > > predicates match the constraints, (ii) the representation of the BMI2
> > > mulx instruction is updated to use the new umul_highpart RTX, and
> > > (iii) because operands
> > > 0 and 1 have different modes in widening multiplications, "a" is a
> > > more appropriate constraint than "0" (which avoids spills/reloads
> > > containing SUBREGs).  The new peephole2 transformations are based upon
> > > those at around line 9951 of i386.md, that begins with the comment ;;
> > > Highpart multiplication peephole2s to tweak register allocation.
> > > ;; mov imm,%rdx; mov %rdi,%rax; imulq %rdx  ->  mov imm,%rax; imulq
> > > %rdi
> > >
> > >
> > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > > and make -k check, both with and without --target_board=unix{-m32}
> > > with no new failures.  Ok for mainline?
> > >
> > >
> > > 2023-10-17  Roger Sayle  <roger@nextmovesoftware.com>
> > >
> > > gcc/ChangeLog
> > >         PR target/110511
> > >         * config/i386/i386.md (<u>mul<mode><dwi>3): Make operands 1 and
> > >         2 take "regiser_operand" and "nonimmediate_operand" respectively.
> > >         (<u>mulqihi3): Likewise.
> > >         (*bmi2_umul<mode><dwi>3_1): Operand 2 needs to be register_operand
> > >         matching the %d constraint.  Use umul_highpart RTX to represent
> > >         the highpart multiplication.
> > >         (*umul<mode><dwi>3_1):  Operand 2 should use regiser_operand
> > >         predicate, and "a" rather than "0" as operands 0 and 2 have
> > >         different modes.
> > >         (define_split): For mul to mulx conversion, use the new
> > >         umul_highpart RTX representation.
> > >         (*mul<mode><dwi>3_1):  Operand 1 should be register_operand
> > >         and the constraint %a as operands 0 and 1 have different modes.
> > >         (*<u>mulqihi3_1): Operand 1 should be register_operand matching
> > >         the constraint %0.
> > >         (define_peephole2): Providing widening multiplication variants
> > >         of the peephole2s that tweak highpart multiplication register
> > >         allocation.
> > >
> > > gcc/testsuite/ChangeLog
> > >         PR target/110511
> > >         * gcc.target/i386/pr110511.c: New test case.

OK.

Thanks,
Uros.

> > >
> >
> >  (define_insn "*bmi2_umul<mode><dwi>3_1"
> >    [(set (match_operand:DWIH 0 "register_operand" "=r")
> >      (mult:DWIH
> > -      (match_operand:DWIH 2 "nonimmediate_operand" "%d")
> > +      (match_operand:DWIH 2 "register_operand" "%d")
> >        (match_operand:DWIH 3 "nonimmediate_operand" "rm")))
> >     (set (match_operand:DWIH 1 "register_operand" "=r")
> >
> > This will fail. Because of %, both predicates must allow nonimmediate_operand,
> > since RA can swap operand constraints due to %.
> >
> > @@ -9747,7 +9743,7 @@
> >    [(set (match_operand:<DWI> 0 "register_operand" "=r,A")
> >      (mult:<DWI>
> >        (zero_extend:<DWI>
> > -        (match_operand:DWIH 1 "nonimmediate_operand" "%d,0"))
> > +        (match_operand:DWIH 1 "register_operand" "%d,a"))
> >        (zero_extend:<DWI>
> >          (match_operand:DWIH 2 "nonimmediate_operand" "rm,rm"))))
> >     (clobber (reg:CC FLAGS_REG))]
> >
> > The same here, although changing "0" to "a" is correct, but the oversight is
> > benign. "A" reads as "ad", and the first alternative already takes "d".
> >
> > +;; Widening multiplication peephole2s to tweak register allocation.
> > +;; mov imm,%rdx; mov %rdi,%rax; mulq %rdx  ->  mov imm,%rax; mulq %rdi
> >
> > Maybe instead of peephole2s, we allow general_operands in the instruction (both
> > inputs) and rely on the RA to fulfill the constraint? Would this work?
> >
> > Uros.
>
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 2a60df5..22f18c2 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9710,33 +9710,29 @@ 
   [(parallel [(set (match_operand:<DWI> 0 "register_operand")
 		   (mult:<DWI>
 		     (any_extend:<DWI>
-		       (match_operand:DWIH 1 "nonimmediate_operand"))
+		       (match_operand:DWIH 1 "register_operand"))
 		     (any_extend:<DWI>
-		       (match_operand:DWIH 2 "register_operand"))))
+		       (match_operand:DWIH 2 "nonimmediate_operand"))))
 	      (clobber (reg:CC FLAGS_REG))])])
 
 (define_expand "<u>mulqihi3"
   [(parallel [(set (match_operand:HI 0 "register_operand")
 		   (mult:HI
 		     (any_extend:HI
-		       (match_operand:QI 1 "nonimmediate_operand"))
+		       (match_operand:QI 1 "register_operand"))
 		     (any_extend:HI
-		       (match_operand:QI 2 "register_operand"))))
+		       (match_operand:QI 2 "nonimmediate_operand"))))
 	      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_QIMODE_MATH")
 
 (define_insn "*bmi2_umul<mode><dwi>3_1"
   [(set (match_operand:DWIH 0 "register_operand" "=r")
 	(mult:DWIH
-	  (match_operand:DWIH 2 "nonimmediate_operand" "%d")
+	  (match_operand:DWIH 2 "register_operand" "%d")
 	  (match_operand:DWIH 3 "nonimmediate_operand" "rm")))
    (set (match_operand:DWIH 1 "register_operand" "=r")
-	(truncate:DWIH
-	  (lshiftrt:<DWI>
-	    (mult:<DWI> (zero_extend:<DWI> (match_dup 2))
-			(zero_extend:<DWI> (match_dup 3)))
-	    (match_operand:QI 4 "const_int_operand"))))]
-  "TARGET_BMI2 && INTVAL (operands[4]) == <MODE_SIZE> * BITS_PER_UNIT
+	(umul_highpart:DWIH (match_dup 2) (match_dup 3)))]
+  "TARGET_BMI2
    && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
   "mulx\t{%3, %0, %1|%1, %0, %3}"
   [(set_attr "type" "imulx")
@@ -9747,7 +9743,7 @@ 
   [(set (match_operand:<DWI> 0 "register_operand" "=r,A")
 	(mult:<DWI>
 	  (zero_extend:<DWI>
-	    (match_operand:DWIH 1 "nonimmediate_operand" "%d,0"))
+	    (match_operand:DWIH 1 "register_operand" "%d,a"))
 	  (zero_extend:<DWI>
 	    (match_operand:DWIH 2 "nonimmediate_operand" "rm,rm"))))
    (clobber (reg:CC FLAGS_REG))]
@@ -9783,11 +9779,7 @@ 
   [(parallel [(set (match_dup 3)
 		   (mult:DWIH (match_dup 1) (match_dup 2)))
 	      (set (match_dup 4)
-		   (truncate:DWIH
-		     (lshiftrt:<DWI>
-		       (mult:<DWI> (zero_extend:<DWI> (match_dup 1))
-				   (zero_extend:<DWI> (match_dup 2)))
-		       (match_dup 5))))])]
+		   (umul_highpart:DWIH (match_dup 1) (match_dup 2)))])]
 {
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[3], &operands[4]);
 
@@ -9798,7 +9790,7 @@ 
   [(set (match_operand:<DWI> 0 "register_operand" "=A")
 	(mult:<DWI>
 	  (sign_extend:<DWI>
-	    (match_operand:DWIH 1 "nonimmediate_operand" "%0"))
+	    (match_operand:DWIH 1 "register_operand" "%a"))
 	  (sign_extend:<DWI>
 	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))
    (clobber (reg:CC FLAGS_REG))]
@@ -9818,7 +9810,7 @@ 
   [(set (match_operand:HI 0 "register_operand" "=a")
 	(mult:HI
 	  (any_extend:HI
-	    (match_operand:QI 1 "nonimmediate_operand" "%0"))
+	    (match_operand:QI 1 "register_operand" "%0"))
 	  (any_extend:HI
 	    (match_operand:QI 2 "nonimmediate_operand" "qm"))))
    (clobber (reg:CC FLAGS_REG))]
@@ -9835,6 +9827,51 @@ 
    (set_attr "bdver1_decode" "direct")
    (set_attr "mode" "QI")])
 
+;; Widening multiplication peephole2s to tweak register allocation.
+;; mov imm,%rdx; mov %rdi,%rax; mulq %rdx  ->  mov imm,%rax; mulq %rdi
+(define_peephole2
+  [(set (match_operand:DWIH 0 "general_reg_operand")
+	(match_operand:DWIH 1 "immediate_operand"))
+   (set (match_operand:DWIH 2 "general_reg_operand")
+	(match_operand:DWIH 3 "general_reg_operand"))
+   (parallel [(set (match_operand:<DWI> 4 "general_reg_operand")
+		   (mult:<DWI> (zero_extend:<DWI> (match_dup 2))
+			       (zero_extend:<DWI> (match_dup 0))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "REGNO (operands[3]) != AX_REG
+   && REGNO (operands[0]) != REGNO (operands[2])
+   && REGNO (operands[0]) != REGNO (operands[3])
+   && (REGNO (operands[0]) == REGNO (operands[4])
+       || REGNO (operands[0]) == DX_REG
+       || peep2_reg_dead_p (3, operands[0]))"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 4)
+		   (mult:<DWI> (zero_extend:<DWI> (match_dup 2))
+			       (zero_extend:<DWI> (match_dup 3))))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+;; mov imm,%rax; mov %rdi,%rdx; mulx %rax  ->  mov imm,%rdx; mulx %rdi
+(define_peephole2
+  [(set (match_operand:DWIH 0 "general_reg_operand")
+	(match_operand:DWIH 1 "immediate_operand"))
+   (set (match_operand:DWIH 2 "general_reg_operand")
+	(match_operand:DWIH 3 "general_reg_operand"))
+   (parallel [(set (match_operand:DWIH 4 "general_reg_operand")
+		   (mult:DWIH (match_dup 2) (match_dup 0)))
+	      (set (match_operand:DWIH 5 "general_reg_operand")
+		   (umul_highpart:DWIH (match_dup 2) (match_dup 0)))])]
+  "REGNO (operands[3]) != DX_REG
+   && REGNO (operands[0]) != REGNO (operands[2])
+   && REGNO (operands[0]) != REGNO (operands[3])
+   && (REGNO (operands[0]) == REGNO (operands[4])
+       || REGNO (operands[0]) == REGNO (operands[5])
+       || peep2_reg_dead_p (3, operands[0]))"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 4)
+		   (mult:DWIH (match_dup 2) (match_dup 3)))
+	      (set (match_dup 5)
+		   (umul_highpart:DWIH (match_dup 2) (match_dup 3)))])])
+
 ;; Highpart multiplication patterns
 (define_insn "<s>mul<mode>3_highpart"
   [(set (match_operand:DWIH 0 "register_operand" "=d")
diff --git a/gcc/testsuite/gcc.target/i386/pr110511.c b/gcc/testsuite/gcc.target/i386/pr110511.c
new file mode 100644
index 0000000..142b808
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110511.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+
+typedef unsigned long long uint64_t;
+
+uint64_t mulx64(uint64_t x)
+{
+    __uint128_t r = (__uint128_t)x * 0x9E3779B97F4A7C15ull;
+    return (uint64_t)r ^ (uint64_t)( r >> 64 );
+}
+
+/* { dg-final { scan-assembler-not "movq" } } */