Message ID | patch-14775-tamar@arm.com |
---|---|
State | New |
Series | [RFC] AArch64 Force complicated constant to memory when beneficial |
Catching up on backlog, sorry for the very late response:

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> Consider the following case:
>
> #include <arm_neon.h>
>
> uint64_t
> test4 (uint8x16_t input)
> {
>   uint8x16_t bool_input = vshrq_n_u8(input, 7);
>   poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
>   poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
>                              vgetq_lane_p64(mask, 0));
>   poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
>   uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
>   return vget_lane_u16((uint16x4_t)res, 3);
> }
>
> which generates (after my CSE patches):
>
> test4:
>         ushr    v0.16b, v0.16b, 7
>         mov     x0, 16512
>         movk    x0, 0x1020, lsl 16
>         movk    x0, 0x408, lsl 32
>         movk    x0, 0x102, lsl 48
>         fmov    d1, x0
>         pmull   v2.1q, v0.1d, v1.1d
>         dup     v1.2d, v1.d[0]
>         pmull2  v0.1q, v0.2d, v1.2d
>         trn2    v2.8b, v2.8b, v0.8b
>         umov    w0, v2.h[3]
>         ret
>
> which is suboptimal, since the constant is never needed on the genreg side
> and should have been materialized on the SIMD side: the constant is so big
> that it otherwise requires 5 instructions to create, 4 mov/movk and one fmov.
>
> The problem is that the choice of which side to materialize the constant on
> can only be made during reload.  We may need an extra register (to hold the
> address), so this can't be done after reload.
>
> I have tried to support this with a pattern during reload, but the problem
> is that I can't seem to find a way to tell reload it should spill a constant
> under condition x.  Instead I tried a split which reload selects when the
> condition holds.

If this is still an issue, one thing to try would be to put a "$" before
the "r" in the GPR alternative.  If that doesn't work then yeah, I think
we're out of luck describing this directly.

If "$" does work, it'd be interesting to see whether "^" does too.

Thanks,
Richard

> This has a couple of issues:
>
> 1. The pattern can be expanded late (could be fixed with !reload_completed).
> 2. Because it's split so late, we can't share the anchors for the ADRP.
> 3. Because it's split so late, reload doesn't know about the spill, and so
>    the ADD lo12 isn't pushed into the addressing mode of the LDR.
>
> I don't know how to fix these properly, since I think the only way is for
> reload to do the spill itself.  But without this pattern it avoids the mem
> alternative and picks r <- n instead, followed by r -> w.
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> w.
> 	* config/aarch64/constraints.md (Dx): New.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
>  )
>  
>  (define_insn_and_split "*movdi_aarch64"
> -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m, r, r, w,r,w, w")
> -	(match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w ,r ,r,w, m,m, r, r, w,r,w,w")
> +	(match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
>    "(register_operand (operands[0], DImode)
>     || aarch64_reg_or_zero (operands[1], DImode))"
>    "@
> @@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
>     mov\\t%x0, %1
>     mov\\t%w0, %1
>     #
> +   #
>     * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
>     ldr\\t%x0, %1
>     ldr\\t%d0, %1
> @@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
>     fmov\\t%d0, %d1
>     * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
>    "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
> -   && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> +   && REG_P (operands[0])
> +   && (GP_REGNUM_P (REGNO (operands[0]))
> +       || (can_create_pseudo_p ()
> +	   && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
>    [(const_int 0)]
>    "{
> -     aarch64_expand_mov_immediate (operands[0], operands[1]);
> +     if (GP_REGNUM_P (REGNO (operands[0])))
> +       aarch64_expand_mov_immediate (operands[0], operands[1]);
> +     else
> +       {
> +	 rtx mem = force_const_mem (DImode, operands[1]);
> +	 gcc_assert (mem);
> +	 emit_move_insn (operands[0], mem);
> +       }
>      DONE;
>    }"
>    ;; The "mov_imm" type for CNTD is just a placeholder.
> -  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
> +  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
> 		     load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
> 		     neon_move")
> -   (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
> +   (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
>  )
>  
>  (define_insn "insv_imm<mode>"
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -474,6 +474,14 @@ (define_address_constraint "Dp"
>     An address valid for a prefetch instruction."
>    (match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
>  
> +(define_constraint "Dx"
> +  "@internal
> +   A constraint that matches an integer immediate operand not valid\
> +   for AdvSIMD scalar operations in DImode."
> +  (and (match_code "const_int")
> +       (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
> +       (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
> +
>  (define_constraint "vgb"
>    "@internal
>     A constraint that matches an immediate offset valid for SVE LD1B
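For reference, GCC's "$" and "^" constraint modifiers are reload-sensitive
variants of "!" and "?": they disparage an alternative severely or slightly,
but only when the marked operand would actually need a reload.  A minimal
sketch of Richard's suggestion applied to the unpatched pattern, assuming the
alternative to mark is the r <- n one (untested):

    ;; Sketch only: "$" on the "r" of the r <- n alternative asks the
    ;; allocator to strongly avoid materializing the constant on the GP
    ;; side when that choice forces a reload; "^" is the milder hint.
    [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,$r,r, r,w, m,m, r, r, w,r,w, w")
          (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M, n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]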
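On issue 3 (the ADD lo12 not being pushed into the LDR), the difference is
between computing the literal-pool address separately and folding the low 12
bits into the load's addressing mode.  An illustrative sketch, where .LC0
stands for a hypothetical constant-pool entry holding 0x0102040810204080:

    // Form the late split tends to produce: address built up front.
    adrp    x0, .LC0                  // page address of the pool entry
    add     x0, x0, :lo12:.LC0        // add the offset within the page
    ldr     d1, [x0]                  // plain load from the computed address

    // Form reload could produce if it knew about the spill:
    adrp    x0, .LC0                  // page address of the pool entry
    ldr     d1, [x0, #:lo12:.LC0]     // page offset folded into the load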