Message ID | 1294440183-885-6-git-send-email-rth@twiddle.net |
---|---|
State | New |
Headers | show |
On Fri, Jan 07, 2011 at 02:43:01PM -0800, Richard Henderson wrote: > Special case deposits that are implementable with byte and word stores. > Otherwise implement with double-word shift plus rotates. > > Expose tcg_reg_alloc to the backend for allocation of scratch registers. > There's an edge condition that cannot actually happen at the moment due > to a bug elsewhere in the register allocator, but it doesn't seem right > to leave that unfixed. > > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > tcg/i386/tcg-target.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-- > tcg/i386/tcg-target.h | 2 + > tcg/tcg.c | 2 + > 3 files changed, 69 insertions(+), 3 deletions(-) > > diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c > index bb19a95..cc7d266 100644 > --- a/tcg/i386/tcg-target.c > +++ b/tcg/i386/tcg-target.c > @@ -258,7 +258,8 @@ static inline int tcg_target_const_match(tcg_target_long val, > #define OPC_JMP_long (0xe9) > #define OPC_JMP_short (0xeb) > #define OPC_LEA (0x8d) > -#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ > +#define OPC_MOVB_EbGb (0x88) /* stores, more or less */ > +#define OPC_MOVB_GbEb (0x8a) /* loads, more or less */ > #define OPC_MOVL_EvGv (0x89) /* stores, more or less */ > #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ > #define OPC_MOVL_EvIz (0xc7) > @@ -277,6 +278,7 @@ static inline int tcg_target_const_match(tcg_target_long val, > #define OPC_SHIFT_1 (0xd1) > #define OPC_SHIFT_Ib (0xc1) > #define OPC_SHIFT_cl (0xd3) > +#define OPC_SHRD_Ib (0xac | P_EXT) > #define OPC_TESTL (0x85) > #define OPC_XCHG_ax_r32 (0x90) > > @@ -710,6 +712,59 @@ static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) > } > } > > +static void tcg_out_deposit(TCGContext *s, int inout, int val, > + unsigned ofs, unsigned len, int rexw) > +{ > + /* Look for MOVW special case. */ > + if (ofs == 0 && len == 16) { > + tcg_out_modrm(s, OPC_MOVL_GvEv + P_DATA16, inout, val); > + return; > + } > + > + /* Look for MOVB w/ %reg_l special case. */ > + if (ofs == 0 && len == 8 > + && (TCG_TARGET_REG_BITS == 64 || (inout < 4 && val < 4))) { > + tcg_out_modrm(s, OPC_MOVB_GbEb + P_REXB_R + P_REXB_RM, inout, val); > + return; > + } > + > + /* Look for MOVB w/ %reg_h special case. */ > + if (ofs == 8 && len == 8 && inout < 4 && val < 4) { > + tcg_out_modrm(s, OPC_MOVB_GbEb, inout + 4, val); > + return; > + } > + > + /* If we have a real deposit from self, we need a temporary. */ > + /* ??? There really ought to be a way to easily allocate a scratch. */ > + if (inout == val) { > + TCGType type = rexw ? TCG_TYPE_I64 : TCG_TYPE_I32; > + TCGRegSet inuse = s->reserved_regs; > + > + tcg_regset_set_reg(inuse, inout); > + val = tcg_reg_alloc(s, tcg_target_available_regs[type], inuse); > + > + tcg_out_mov(s, type, val, inout); I am a bit worried by allocating a new register here, especially on the i386 target, where the number of free registers is quite low, and often 0. We already had to tweak some code to avoid calls to tcg_abort() due to missing registers. > + } > + > + /* Arrange for the field to be at offset 0. */ > + if (ofs != 0) { > + tcg_out_shifti(s, SHIFT_ROR + rexw, inout, ofs); > + } > + > + /* Shift the value into the top of the word. This shifts the old > + field out of the bottom of the word and leaves us with the whole > + word rotated right by the size of the field. */ > + tcg_out_modrm(s, OPC_SHRD_Ib + rexw, val, inout); > + tcg_out8(s, len); > + > + /* Restore the field to its proper location. */ > + ofs = (len + ofs) & (rexw ? 63 : 31); > + if (ofs != 0) { > + tcg_out_shifti(s, SHIFT_ROL + rexw, inout, ofs); > + } > +} > + > + > /* Use SMALL != 0 to force a short forward branch. */ > static void tcg_out_jxx(TCGContext *s, int opc, int label_index, int small) > { > @@ -1266,7 +1321,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi, > > switch (sizeop) { > case 0: > - tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R, datalo, base, ofs); > + tcg_out_modrm_offset(s, OPC_MOVB_EbGb + P_REXB_R, datalo, base, ofs); > break; > case 1: > if (bswap) { > @@ -1504,7 +1559,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > break; > > OP_32_64(st8): > - tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, > + tcg_out_modrm_offset(s, OPC_MOVB_EbGb | P_REXB_R, > args[0], args[1], args[2]); > break; > OP_32_64(st16): > @@ -1603,6 +1658,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > } > break; > > + OP_32_64(deposit): > + tcg_out_deposit(s, args[0], args[2], > + args[3] >> 8, args[3] & 0xff, rexw); > + break; > + > case INDEX_op_brcond_i32: > tcg_out_brcond32(s, args[2], args[0], args[1], const_args[1], > args[3], 0); > @@ -1783,6 +1843,7 @@ static const TCGTargetOpDef x86_op_defs[] = { > { INDEX_op_sar_i32, { "r", "0", "ci" } }, > { INDEX_op_rotl_i32, { "r", "0", "ci" } }, > { INDEX_op_rotr_i32, { "r", "0", "ci" } }, > + { INDEX_op_deposit_i32, { "r", "0", "r" } }, > > { INDEX_op_brcond_i32, { "r", "ri" } }, > > @@ -1835,6 +1896,7 @@ static const TCGTargetOpDef x86_op_defs[] = { > { INDEX_op_sar_i64, { "r", "0", "ci" } }, > { INDEX_op_rotl_i64, { "r", "0", "ci" } }, > { INDEX_op_rotr_i64, { "r", "0", "ci" } }, > + { INDEX_op_deposit_i64, { "r", "0", "r" } }, > > { INDEX_op_brcond_i64, { "r", "re" } }, > { INDEX_op_setcond_i64, { "r", "r", "re" } }, > diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h > index bfafbfc..9f90d17 100644 > --- a/tcg/i386/tcg-target.h > +++ b/tcg/i386/tcg-target.h > @@ -77,6 +77,7 @@ enum { > /* optional instructions */ > #define TCG_TARGET_HAS_div2_i32 > #define TCG_TARGET_HAS_rot_i32 > +#define TCG_TARGET_HAS_deposit_i32 > #define TCG_TARGET_HAS_ext8s_i32 > #define TCG_TARGET_HAS_ext16s_i32 > #define TCG_TARGET_HAS_ext8u_i32 > @@ -94,6 +95,7 @@ enum { > #if TCG_TARGET_REG_BITS == 64 > #define TCG_TARGET_HAS_div2_i64 > #define TCG_TARGET_HAS_rot_i64 > +#define TCG_TARGET_HAS_deposit_i64 > #define TCG_TARGET_HAS_ext8s_i64 > #define TCG_TARGET_HAS_ext16s_i64 > #define TCG_TARGET_HAS_ext32s_i64 > diff --git a/tcg/tcg.c b/tcg/tcg.c > index e95a42f..5ab9122 100644 > --- a/tcg/tcg.c > +++ b/tcg/tcg.c > @@ -156,6 +156,8 @@ int gen_new_label(void) > return idx; > } > > +static int tcg_reg_alloc(TCGContext *s, TCGRegSet reg1, TCGRegSet reg2); > + > #include "tcg-target.c" > > /* pool based memory allocation */ > -- > 1.7.2.3 > >
On 01/09/2011 01:53 PM, Aurelien Jarno wrote: >> + if (inout == val) { >> + TCGType type = rexw ? TCG_TYPE_I64 : TCG_TYPE_I32; >> + TCGRegSet inuse = s->reserved_regs; >> + >> + tcg_regset_set_reg(inuse, inout); >> + val = tcg_reg_alloc(s, tcg_target_available_regs[type], inuse); >> + >> + tcg_out_mov(s, type, val, inout); > > I am a bit worried by allocating a new register here, especially on the > i386 target, where the number of free registers is quite low, and often > 0. We already had to tweak some code to avoid calls to tcg_abort() due > to missing registers. Well, as I said, this case can't actually trigger due to a bug in the register allocator. This can be seen in an insn like mov %dl,%dh where you would expect to see deposit x,x,x,8,8 however, the matching constraint forces the destination and the matching source into a new register: /* if the input is aliased to an output and if it is not dead after the instruction, we must allocate a new register and move it */ if (!IS_DEAD_IARG(i - nb_oargs)) goto allocate_in_reg; which means that we'll always see mov y,x deposit y,y,x,8,8 So I could simply put a tcg_abort there. It would be up to whoever improves the register allocator to provide some mechanism for a backend to allocate a scratch. What do you think? r~
On Sun, Jan 09, 2011 at 02:55:13PM -0800, Richard Henderson wrote: > On 01/09/2011 01:53 PM, Aurelien Jarno wrote: > >> + if (inout == val) { > >> + TCGType type = rexw ? TCG_TYPE_I64 : TCG_TYPE_I32; > >> + TCGRegSet inuse = s->reserved_regs; > >> + > >> + tcg_regset_set_reg(inuse, inout); > >> + val = tcg_reg_alloc(s, tcg_target_available_regs[type], inuse); > >> + > >> + tcg_out_mov(s, type, val, inout); > > > > I am a bit worried by allocating a new register here, especially on the > > i386 target, where the number of free registers is quite low, and often > > 0. We already had to tweak some code to avoid calls to tcg_abort() due > > to missing registers. > > Well, as I said, this case can't actually trigger due to a bug in the > register allocator. This can be seen in an insn like > > mov %dl,%dh > > where you would expect to see > > deposit x,x,x,8,8 > > however, the matching constraint forces the destination and the matching > source into a new register: > > /* if the input is aliased to an output and if it is > not dead after the instruction, we must allocate > a new register and move it */ > if (!IS_DEAD_IARG(i - nb_oargs)) > goto allocate_in_reg; I guess we should also add a case when the input argument and the output argument are the same. > which means that we'll always see > > mov y,x > deposit y,y,x,8,8 > > So I could simply put a tcg_abort there. It would be up to whoever > improves the register allocator to provide some mechanism for a > backend to allocate a scratch. What do you think? > The code being written now or latter doesn't change the question to know if it is always possible to allocate one scratch register here on i386. Up to know we didn't provide any function for allocating a scratch register, as on CISC CPU the op directly match to an instruction whereas on RISC CPUs where the op has to be emulated with multiple instructions, they are enough registers available to define one or more permanent scratch register.
On 01/09/2011 04:16 PM, Aurelien Jarno wrote: > The code being written now or latter doesn't change the question to know > if it is always possible to allocate one scratch register here on i386. Yes. Here there's only one register that needs to remain live. In the worst case, we'll spill one live register. Which we can always do via a simple store via AREG0 + offset. r~
On Sun, Jan 09, 2011 at 04:43:22PM -0800, Richard Henderson wrote: > On 01/09/2011 04:16 PM, Aurelien Jarno wrote: > > The code being written now or latter doesn't change the question to know > > if it is always possible to allocate one scratch register here on i386. > > Yes. > > Here there's only one register that needs to remain live. In the > worst case, we'll spill one live register. Which we can always do > via a simple store via AREG0 + offset. Oh right correct we are inside the instruction and in a case we have very few arguments, and only 32-bit. If we allow the target to allocate scratch registers themselves, it's probably better to do that via a function.
On Sun, Jan 09, 2011 at 02:55:13PM -0800, Richard Henderson wrote: > On 01/09/2011 01:53 PM, Aurelien Jarno wrote: > >> + if (inout == val) { > >> + TCGType type = rexw ? TCG_TYPE_I64 : TCG_TYPE_I32; > >> + TCGRegSet inuse = s->reserved_regs; > >> + > >> + tcg_regset_set_reg(inuse, inout); > >> + val = tcg_reg_alloc(s, tcg_target_available_regs[type], inuse); > >> + > >> + tcg_out_mov(s, type, val, inout); > > > > I am a bit worried by allocating a new register here, especially on the > > i386 target, where the number of free registers is quite low, and often > > 0. We already had to tweak some code to avoid calls to tcg_abort() due > > to missing registers. > > Well, as I said, this case can't actually trigger due to a bug in the > register allocator. This can be seen in an insn like > > mov %dl,%dh > > where you would expect to see > > deposit x,x,x,8,8 > > however, the matching constraint forces the destination and the matching > source into a new register: > > /* if the input is aliased to an output and if it is > not dead after the instruction, we must allocate > a new register and move it */ > if (!IS_DEAD_IARG(i - nb_oargs)) > goto allocate_in_reg; I have not been able to trigger this code path with a deposit instruction. > which means that we'll always see > > mov y,x > deposit y,y,x,8,8 > > So I could simply put a tcg_abort there. It would be up to whoever > improves the register allocator to provide some mechanism for a > backend to allocate a scratch. What do you think? > Do you have a way to trigger this problem? or a dump of the ops and asm output?
On 01/10/2011 10:37 AM, Aurelien Jarno wrote: >> mov y,x >> deposit y,y,x,8,8 >> >> So I could simply put a tcg_abort there. It would be up to whoever >> improves the register allocator to provide some mechanism for a >> backend to allocate a scratch. What do you think? >> > > Do you have a way to trigger this problem? or a dump of the ops and asm > output? IN: 0x408120c4: rlwimi r4,r4,8,16,23 OP: ---- 0x408120c4 deposit_i32 r4,r4,r4,8,8 goto_tb $0x0 movi_i32 nip,$0x408120c8 exit_tb $0x7fbb00ca5758 OUT: [size=52] 0x60294380: mov 0x10(%r14),%ebp 0x60294384: mov %ebp,%ebx 0x60294386: ror $0x8,%ebp 0x60294389: shrd $0x8,%ebx,%ebp 0x6029438d: rol $0x10,%ebp 0x60294390: mov %ebp,0x10(%r14) 0x60294394: jmpq 0x60294399 0x60294399: mov $0x408120c8,%ebp 0x6029439e: mov %ebp,0x25c(%r14) 0x602943a5: mov $0x7fbb00ca5758,%rax 0x602943af: jmpq 0x622b772e That should do it. This is present in linux-user-test ppc/ls. This output still contains that allocate-a-scratch path. r~
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index bb19a95..cc7d266 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -258,7 +258,8 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_JMP_long (0xe9) #define OPC_JMP_short (0xeb) #define OPC_LEA (0x8d) -#define OPC_MOVB_EvGv (0x88) /* stores, more or less */ +#define OPC_MOVB_EbGb (0x88) /* stores, more or less */ +#define OPC_MOVB_GbEb (0x8a) /* loads, more or less */ #define OPC_MOVL_EvGv (0x89) /* stores, more or less */ #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */ #define OPC_MOVL_EvIz (0xc7) @@ -277,6 +278,7 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_SHIFT_1 (0xd1) #define OPC_SHIFT_Ib (0xc1) #define OPC_SHIFT_cl (0xd3) +#define OPC_SHRD_Ib (0xac | P_EXT) #define OPC_TESTL (0x85) #define OPC_XCHG_ax_r32 (0x90) @@ -710,6 +712,59 @@ static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val) } } +static void tcg_out_deposit(TCGContext *s, int inout, int val, + unsigned ofs, unsigned len, int rexw) +{ + /* Look for MOVW special case. */ + if (ofs == 0 && len == 16) { + tcg_out_modrm(s, OPC_MOVL_GvEv + P_DATA16, inout, val); + return; + } + + /* Look for MOVB w/ %reg_l special case. */ + if (ofs == 0 && len == 8 + && (TCG_TARGET_REG_BITS == 64 || (inout < 4 && val < 4))) { + tcg_out_modrm(s, OPC_MOVB_GbEb + P_REXB_R + P_REXB_RM, inout, val); + return; + } + + /* Look for MOVB w/ %reg_h special case. */ + if (ofs == 8 && len == 8 && inout < 4 && val < 4) { + tcg_out_modrm(s, OPC_MOVB_GbEb, inout + 4, val); + return; + } + + /* If we have a real deposit from self, we need a temporary. */ + /* ??? There really ought to be a way to easily allocate a scratch. */ + if (inout == val) { + TCGType type = rexw ? TCG_TYPE_I64 : TCG_TYPE_I32; + TCGRegSet inuse = s->reserved_regs; + + tcg_regset_set_reg(inuse, inout); + val = tcg_reg_alloc(s, tcg_target_available_regs[type], inuse); + + tcg_out_mov(s, type, val, inout); + } + + /* Arrange for the field to be at offset 0. */ + if (ofs != 0) { + tcg_out_shifti(s, SHIFT_ROR + rexw, inout, ofs); + } + + /* Shift the value into the top of the word. This shifts the old + field out of the bottom of the word and leaves us with the whole + word rotated right by the size of the field. */ + tcg_out_modrm(s, OPC_SHRD_Ib + rexw, val, inout); + tcg_out8(s, len); + + /* Restore the field to its proper location. */ + ofs = (len + ofs) & (rexw ? 63 : 31); + if (ofs != 0) { + tcg_out_shifti(s, SHIFT_ROL + rexw, inout, ofs); + } +} + + /* Use SMALL != 0 to force a short forward branch. */ static void tcg_out_jxx(TCGContext *s, int opc, int label_index, int small) { @@ -1266,7 +1321,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi, switch (sizeop) { case 0: - tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R, datalo, base, ofs); + tcg_out_modrm_offset(s, OPC_MOVB_EbGb + P_REXB_R, datalo, base, ofs); break; case 1: if (bswap) { @@ -1504,7 +1559,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, break; OP_32_64(st8): - tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, + tcg_out_modrm_offset(s, OPC_MOVB_EbGb | P_REXB_R, args[0], args[1], args[2]); break; OP_32_64(st16): @@ -1603,6 +1658,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, } break; + OP_32_64(deposit): + tcg_out_deposit(s, args[0], args[2], + args[3] >> 8, args[3] & 0xff, rexw); + break; + case INDEX_op_brcond_i32: tcg_out_brcond32(s, args[2], args[0], args[1], const_args[1], args[3], 0); @@ -1783,6 +1843,7 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_sar_i32, { "r", "0", "ci" } }, { INDEX_op_rotl_i32, { "r", "0", "ci" } }, { INDEX_op_rotr_i32, { "r", "0", "ci" } }, + { INDEX_op_deposit_i32, { "r", "0", "r" } }, { INDEX_op_brcond_i32, { "r", "ri" } }, @@ -1835,6 +1896,7 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_sar_i64, { "r", "0", "ci" } }, { INDEX_op_rotl_i64, { "r", "0", "ci" } }, { INDEX_op_rotr_i64, { "r", "0", "ci" } }, + { INDEX_op_deposit_i64, { "r", "0", "r" } }, { INDEX_op_brcond_i64, { "r", "re" } }, { INDEX_op_setcond_i64, { "r", "r", "re" } }, diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index bfafbfc..9f90d17 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -77,6 +77,7 @@ enum { /* optional instructions */ #define TCG_TARGET_HAS_div2_i32 #define TCG_TARGET_HAS_rot_i32 +#define TCG_TARGET_HAS_deposit_i32 #define TCG_TARGET_HAS_ext8s_i32 #define TCG_TARGET_HAS_ext16s_i32 #define TCG_TARGET_HAS_ext8u_i32 @@ -94,6 +95,7 @@ enum { #if TCG_TARGET_REG_BITS == 64 #define TCG_TARGET_HAS_div2_i64 #define TCG_TARGET_HAS_rot_i64 +#define TCG_TARGET_HAS_deposit_i64 #define TCG_TARGET_HAS_ext8s_i64 #define TCG_TARGET_HAS_ext16s_i64 #define TCG_TARGET_HAS_ext32s_i64 diff --git a/tcg/tcg.c b/tcg/tcg.c index e95a42f..5ab9122 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -156,6 +156,8 @@ int gen_new_label(void) return idx; } +static int tcg_reg_alloc(TCGContext *s, TCGRegSet reg1, TCGRegSet reg2); + #include "tcg-target.c" /* pool based memory allocation */
Special case deposits that are implementable with byte and word stores. Otherwise implement with double-word shift plus rotates. Expose tcg_reg_alloc to the backend for allocation of scratch registers. There's an edge condition that cannot actually happen at the moment due to a bug elsewhere in the register allocator, but it doesn't seem right to leave that unfixed. Signed-off-by: Richard Henderson <rth@twiddle.net> --- tcg/i386/tcg-target.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-- tcg/i386/tcg-target.h | 2 + tcg/tcg.c | 2 + 3 files changed, 69 insertions(+), 3 deletions(-)