Message ID | VI1PR0801MB21270C389F511EBFD183300583C60@VI1PR0801MB2127.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | [ARM] Fix low reg issue in Thumb-2 movsi patterns | expand |
Hi Wilco, On 7/24/19 4:16 PM, Wilco Dijkstra wrote: > The Thumb-2 movsi patterns try to prefer low registers for loads and > stores. > However this is done incorrectly by using 2 separate variants with 'l' > and 'h' > register classes. The register allocator will only use low registers, and > as a result we end up with significantly more spills and moves to high > registers. Fix this by merging the alternatives and use 'l*r' to indicate > preference for low registers. This saves ~400 instructions from the > pr77308 > testcase. > > Bootstrap & regress OK on arm-none-linux-gnueabihf --with-cpu=cortex-a57 > LGTM. Ok. Thanks, Kyrill > ChangeLog: > 2019-07-24 Wilco Dijkstra <wdijkstr@arm.com> > > * config/arm/thumb2.md (thumb2_movsi_insn): Fix load/store low reg. > * config/arm/vfp.md (thumb2_movsi_vfp): Likewise. > > -- > diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md > index > 78a6ea0b10dab97ed6651ce62e99cfd7a81722ab..c7000d0772a7e5887b6d05be188e8eb38c97217d > 100644 > --- a/gcc/config/arm/thumb2.md > +++ b/gcc/config/arm/thumb2.md > @@ -247,8 +247,8 @@ (define_insn "*thumb2_pop_single" > ;; regs. The high register alternatives are not taken into account when > ;; choosing register preferences in order to reflect their expense. > (define_insn "*thumb2_movsi_insn" > - [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l > ,*hk,m,*m") > -(match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk"))] > + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l*rk,m") > +(match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,l*rk"))] > "TARGET_THUMB2 && !TARGET_IWMMXT && !TARGET_HARD_FLOAT > && ( register_operand (operands[0], SImode) > || register_operand (operands[1], SImode))" > @@ -262,22 +262,20 @@ (define_insn "*thumb2_movsi_insn" > case 3: return \"mvn%?\\t%0, #%B1\"; > case 4: return \"movw%?\\t%0, %1\"; > case 5: > - case 6: > /* Cannot load it directly, split to load it via MOV / MOVT. 
*/ > if (!MEM_P (operands[1]) && arm_disable_literal_pool) > return \"#\"; > return \"ldr%?\\t%0, %1\"; > - case 7: > - case 8: return \"str%?\\t%1, %0\"; > + case 6: return \"str%?\\t%1, %0\"; > default: gcc_unreachable (); > } > } > - [(set_attr "type" > "mov_reg,mov_imm,mov_imm,mvn_imm,mov_imm,load_4,load_4,store_4,store_4") > - (set_attr "length" "2,4,2,4,4,4,4,4,4") > + [(set_attr "type" > "mov_reg,mov_imm,mov_imm,mvn_imm,mov_imm,load_4,store_4") > + (set_attr "length" "2,4,2,4,4,4,4") > (set_attr "predicable" "yes") > - (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no,no,no") > - (set_attr "pool_range" "*,*,*,*,*,1018,4094,*,*") > - (set_attr "neg_pool_range" "*,*,*,*,*,0,0,*,*")] > + (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no") > + (set_attr "pool_range" "*,*,*,*,*,4094,*") > + (set_attr "neg_pool_range" "*,*,*,*,*,0,*")] > ) > > (define_insn "tls_load_dot_plus_four" > diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md > index > e0aaa7b00bb41c046da4531a293e123c94e8b9a4..b59dd6b71d228e042feda3a3a06d81dd01d200da > 100644 > --- a/gcc/config/arm/vfp.md > +++ b/gcc/config/arm/vfp.md > @@ -258,8 +258,8 @@ (define_insn "*arm_movsi_vfp" > ;; is chosen with length 2 when the instruction is predicated for > ;; arm_restrict_it. > (define_insn "*thumb2_movsi_vfp" > - [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r, > l,*hk,m, *m,*t, r,*t,*t, *Uv") > -(match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk, > r,*t,*t,*UvTu,*t"))] > + [(set (match_operand:SI 0 "nonimmediate_operand" > "=rk,r,l,r,r,l*rk,m,*t, r,*t,*t, *Uv") > +(match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,l*rk, > r,*t,*t,*UvTu,*t"))] > "TARGET_THUMB2 && TARGET_HARD_FLOAT > && ( s_register_operand (operands[0], SImode) > || s_register_operand (operands[1], SImode))" > @@ -275,32 +275,30 @@ (define_insn "*thumb2_movsi_vfp" > case 4: > return \"movw%?\\t%0, %1\"; > case 5: > - case 6: > /* Cannot load it directly, split to load it via MOV / MOVT. 
*/ > if (!MEM_P (operands[1]) && arm_disable_literal_pool) > return \"#\"; > return \"ldr%?\\t%0, %1\"; > - case 7: > - case 8: > + case 6: > return \"str%?\\t%1, %0\"; > - case 9: > + case 7: > return \"vmov%?\\t%0, %1\\t%@ int\"; > - case 10: > + case 8: > return \"vmov%?\\t%0, %1\\t%@ int\"; > - case 11: > + case 9: > return \"vmov%?.f32\\t%0, %1\\t%@ int\"; > - case 12: case 13: > + case 10: case 11: > return output_move_vfp (operands); > default: > gcc_unreachable (); > } > " > [(set_attr "predicable" "yes") > - (set_attr "predicable_short_it" > "yes,no,yes,no,no,no,no,no,no,no,no,no,no,no") > - (set_attr "type" > "mov_reg,mov_reg,mov_reg,mvn_reg,mov_imm,load_4,load_4,store_4,store_4,f_mcr,f_mrc,fmov,f_loads,f_stores") > - (set_attr "length" "2,4,2,4,4,4,4,4,4,4,4,4,4,4") > - (set_attr "pool_range" "*,*,*,*,*,1018,4094,*,*,*,*,*,1018,*") > - (set_attr "neg_pool_range" "*,*,*,*,*, 0, 0,*,*,*,*,*,1008,*")] > + (set_attr "predicable_short_it" > "yes,no,yes,no,no,no,no,no,no,no,no,no") > + (set_attr "type" > "mov_reg,mov_reg,mov_reg,mvn_reg,mov_imm,load_4,store_4,f_mcr,f_mrc,fmov,f_loads,f_stores") > + (set_attr "length" "2,4,2,4,4,4,4,4,4,4,4,4") > + (set_attr "pool_range" "*,*,*,*,*,4094,*,*,*,*,1018,*") > + (set_attr "neg_pool_range" "*,*,*,*,*, 0,*,*,*,*,1008,*")] > ) > >
On 24/07/2019 16:16, Wilco Dijkstra wrote: > The Thumb-2 movsi patterns try to prefer low registers for loads and stores. > However this is done incorrectly by using 2 separate variants with 'l' and 'h' > register classes. The register allocator will only use low registers, and > as a result we end up with significantly more spills and moves to high > registers. Fix this by merging the alternatives and use 'l*r' to indicate > preference for low registers. This saves ~400 instructions from the pr77308 > testcase. > > Bootstrap & regress OK on arm-none-linux-gnueabihf --with-cpu=cortex-a57 > > ChangeLog: > 2019-07-24 Wilco Dijkstra <wdijkstr@arm.com> > > * config/arm/thumb2.md (thumb2_movsi_insn): Fix load/store low reg. > * config/arm/vfp.md (thumb2_movsi_vfp): Likewise. > > -- > diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md > index 78a6ea0b10dab97ed6651ce62e99cfd7a81722ab..c7000d0772a7e5887b6d05be188e8eb38c97217d 100644 > --- a/gcc/config/arm/thumb2.md > +++ b/gcc/config/arm/thumb2.md > @@ -247,8 +247,8 @@ (define_insn "*thumb2_pop_single" > ;; regs. The high register alternatives are not taken into account when > ;; choosing register preferences in order to reflect their expense. > (define_insn "*thumb2_movsi_insn" > - [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l ,*hk,m,*m") > - (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk"))] > + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l*rk,m") > + (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,l*rk"))] I think this should be "lk*r", not "l*rk". SP is only going to crop up in rare circumstances, but we are always going to need this pattern if it does and hiding this from register preferencing is pointless. It's not like the compiler is going to start allocating SP in the more general case. 
Same in the other case below, which leads to: I'd also like to see all these movsi matching patterns merged into a single pattern that just enables the appropriate variants. Having separate implementations for Arm, thumb2, vfp, iwmmx is just making maintenance of this stuff a nightmare. R. > "TARGET_THUMB2 && !TARGET_IWMMXT && !TARGET_HARD_FLOAT > && ( register_operand (operands[0], SImode) > || register_operand (operands[1], SImode))" > @@ -262,22 +262,20 @@ (define_insn "*thumb2_movsi_insn" > case 3: return \"mvn%?\\t%0, #%B1\"; > case 4: return \"movw%?\\t%0, %1\"; > case 5: > - case 6: > /* Cannot load it directly, split to load it via MOV / MOVT. */ > if (!MEM_P (operands[1]) && arm_disable_literal_pool) > return \"#\"; > return \"ldr%?\\t%0, %1\"; > - case 7: > - case 8: return \"str%?\\t%1, %0\"; > + case 6: return \"str%?\\t%1, %0\"; > default: gcc_unreachable (); > } > } > - [(set_attr "type" "mov_reg,mov_imm,mov_imm,mvn_imm,mov_imm,load_4,load_4,store_4,store_4") > - (set_attr "length" "2,4,2,4,4,4,4,4,4") > + [(set_attr "type" "mov_reg,mov_imm,mov_imm,mvn_imm,mov_imm,load_4,store_4") > + (set_attr "length" "2,4,2,4,4,4,4") > (set_attr "predicable" "yes") > - (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no,no,no") > - (set_attr "pool_range" "*,*,*,*,*,1018,4094,*,*") > - (set_attr "neg_pool_range" "*,*,*,*,*,0,0,*,*")] > + (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no") > + (set_attr "pool_range" "*,*,*,*,*,4094,*") > + (set_attr "neg_pool_range" "*,*,*,*,*,0,*")] > ) > > (define_insn "tls_load_dot_plus_four" > diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md > index e0aaa7b00bb41c046da4531a293e123c94e8b9a4..b59dd6b71d228e042feda3a3a06d81dd01d200da 100644 > --- a/gcc/config/arm/vfp.md > +++ b/gcc/config/arm/vfp.md > @@ -258,8 +258,8 @@ (define_insn "*arm_movsi_vfp" > ;; is chosen with length 2 when the instruction is predicated for > ;; arm_restrict_it. 
> (define_insn "*thumb2_movsi_vfp" > - [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r, l,*hk,m, *m,*t, r,*t,*t, *Uv") > - (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk, r,*t,*t,*UvTu,*t"))] > + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l*rk,m,*t, r,*t,*t, *Uv") > + (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,l*rk, r,*t,*t,*UvTu,*t"))] > "TARGET_THUMB2 && TARGET_HARD_FLOAT > && ( s_register_operand (operands[0], SImode) > || s_register_operand (operands[1], SImode))" > @@ -275,32 +275,30 @@ (define_insn "*thumb2_movsi_vfp" > case 4: > return \"movw%?\\t%0, %1\"; > case 5: > - case 6: > /* Cannot load it directly, split to load it via MOV / MOVT. */ > if (!MEM_P (operands[1]) && arm_disable_literal_pool) > return \"#\"; > return \"ldr%?\\t%0, %1\"; > - case 7: > - case 8: > + case 6: > return \"str%?\\t%1, %0\"; > - case 9: > + case 7: > return \"vmov%?\\t%0, %1\\t%@ int\"; > - case 10: > + case 8: > return \"vmov%?\\t%0, %1\\t%@ int\"; > - case 11: > + case 9: > return \"vmov%?.f32\\t%0, %1\\t%@ int\"; > - case 12: case 13: > + case 10: case 11: > return output_move_vfp (operands); > default: > gcc_unreachable (); > } > " > [(set_attr "predicable" "yes") > - (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no,no,no,no,no,no,no,no") > - (set_attr "type" "mov_reg,mov_reg,mov_reg,mvn_reg,mov_imm,load_4,load_4,store_4,store_4,f_mcr,f_mrc,fmov,f_loads,f_stores") > - (set_attr "length" "2,4,2,4,4,4,4,4,4,4,4,4,4,4") > - (set_attr "pool_range" "*,*,*,*,*,1018,4094,*,*,*,*,*,1018,*") > - (set_attr "neg_pool_range" "*,*,*,*,*, 0, 0,*,*,*,*,*,1008,*")] > + (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no,no,no,no,no,no") > + (set_attr "type" "mov_reg,mov_reg,mov_reg,mvn_reg,mov_imm,load_4,store_4,f_mcr,f_mrc,fmov,f_loads,f_stores") > + (set_attr "length" "2,4,2,4,4,4,4,4,4,4,4,4") > + (set_attr "pool_range" "*,*,*,*,*,4094,*,*,*,*,1018,*") > + (set_attr "neg_pool_range" "*,*,*,*,*, 0,*,*,*,*,1008,*")] > 
) > > >
Hi Richard, > I think this should be "lk*r", not "l*rk". SP is only going to crop up > in rare circumstances, but we are always going to need this pattern if > it does and hiding this from register preferencing is pointless. It's > not like the compiler is going to start allocating SP in the more > general case. '*' only applies to the next constraint, so these are equivalent. I've committed it with them swapped around, however using 'k' at all here seems redundant anyway given GCC no longer uses physical registers in most instructions. > I'd also like to see all these movsi matching patterns merged into a > single pattern that just enables the appropriate variants. Having > separate implementations for Arm, thumb2, vfp, iwmmx is just making > maintenance of this stuff a nightmare. Yes if we're sure one is a strict superset of the other, we could just merge them and disable VFP specific variants. However the patterns are quite complex so it's hard to be 100% sure. Also I think there should be no need at all for such huge patterns given a load or immediate can never become a register move and vice versa. So it should be feasible to split into register moves, loads, stores and immediates. Wilco
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 78a6ea0b10dab97ed6651ce62e99cfd7a81722ab..c7000d0772a7e5887b6d05be188e8eb38c97217d 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -247,8 +247,8 @@ (define_insn "*thumb2_pop_single" ;; regs. The high register alternatives are not taken into account when ;; choosing register preferences in order to reflect their expense. (define_insn "*thumb2_movsi_insn" - [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l ,*hk,m,*m") - (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk"))] + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l*rk,m") + (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,l*rk"))] "TARGET_THUMB2 && !TARGET_IWMMXT && !TARGET_HARD_FLOAT && ( register_operand (operands[0], SImode) || register_operand (operands[1], SImode))" @@ -262,22 +262,20 @@ (define_insn "*thumb2_movsi_insn" case 3: return \"mvn%?\\t%0, #%B1\"; case 4: return \"movw%?\\t%0, %1\"; case 5: - case 6: /* Cannot load it directly, split to load it via MOV / MOVT. 
*/ if (!MEM_P (operands[1]) && arm_disable_literal_pool) return \"#\"; return \"ldr%?\\t%0, %1\"; - case 7: - case 8: return \"str%?\\t%1, %0\"; + case 6: return \"str%?\\t%1, %0\"; default: gcc_unreachable (); } } - [(set_attr "type" "mov_reg,mov_imm,mov_imm,mvn_imm,mov_imm,load_4,load_4,store_4,store_4") - (set_attr "length" "2,4,2,4,4,4,4,4,4") + [(set_attr "type" "mov_reg,mov_imm,mov_imm,mvn_imm,mov_imm,load_4,store_4") + (set_attr "length" "2,4,2,4,4,4,4") (set_attr "predicable" "yes") - (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no,no,no") - (set_attr "pool_range" "*,*,*,*,*,1018,4094,*,*") - (set_attr "neg_pool_range" "*,*,*,*,*,0,0,*,*")] + (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no") + (set_attr "pool_range" "*,*,*,*,*,4094,*") + (set_attr "neg_pool_range" "*,*,*,*,*,0,*")] ) (define_insn "tls_load_dot_plus_four" diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index e0aaa7b00bb41c046da4531a293e123c94e8b9a4..b59dd6b71d228e042feda3a3a06d81dd01d200da 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -258,8 +258,8 @@ (define_insn "*arm_movsi_vfp" ;; is chosen with length 2 when the instruction is predicated for ;; arm_restrict_it. (define_insn "*thumb2_movsi_vfp" - [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r, l,*hk,m, *m,*t, r,*t,*t, *Uv") - (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,*mi,l,*hk, r,*t,*t,*UvTu,*t"))] + [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,l,r,r,l*rk,m,*t, r,*t,*t, *Uv") + (match_operand:SI 1 "general_operand" "rk,I,Py,K,j,mi,l*rk, r,*t,*t,*UvTu,*t"))] "TARGET_THUMB2 && TARGET_HARD_FLOAT && ( s_register_operand (operands[0], SImode) || s_register_operand (operands[1], SImode))" @@ -275,32 +275,30 @@ (define_insn "*thumb2_movsi_vfp" case 4: return \"movw%?\\t%0, %1\"; case 5: - case 6: /* Cannot load it directly, split to load it via MOV / MOVT. 
*/ if (!MEM_P (operands[1]) && arm_disable_literal_pool) return \"#\"; return \"ldr%?\\t%0, %1\"; - case 7: - case 8: + case 6: return \"str%?\\t%1, %0\"; - case 9: + case 7: return \"vmov%?\\t%0, %1\\t%@ int\"; - case 10: + case 8: return \"vmov%?\\t%0, %1\\t%@ int\"; - case 11: + case 9: return \"vmov%?.f32\\t%0, %1\\t%@ int\"; - case 12: case 13: + case 10: case 11: return output_move_vfp (operands); default: gcc_unreachable (); } " [(set_attr "predicable" "yes") - (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no,no,no,no,no,no,no,no") - (set_attr "type" "mov_reg,mov_reg,mov_reg,mvn_reg,mov_imm,load_4,load_4,store_4,store_4,f_mcr,f_mrc,fmov,f_loads,f_stores") - (set_attr "length" "2,4,2,4,4,4,4,4,4,4,4,4,4,4") - (set_attr "pool_range" "*,*,*,*,*,1018,4094,*,*,*,*,*,1018,*") - (set_attr "neg_pool_range" "*,*,*,*,*, 0, 0,*,*,*,*,*,1008,*")] + (set_attr "predicable_short_it" "yes,no,yes,no,no,no,no,no,no,no,no,no") + (set_attr "type" "mov_reg,mov_reg,mov_reg,mvn_reg,mov_imm,load_4,store_4,f_mcr,f_mrc,fmov,f_loads,f_stores") + (set_attr "length" "2,4,2,4,4,4,4,4,4,4,4,4") + (set_attr "pool_range" "*,*,*,*,*,4094,*,*,*,*,1018,*") + (set_attr "neg_pool_range" "*,*,*,*,*, 0,*,*,*,*,1008,*")] )