Message ID | 1271429444-900-2-git-send-email-rth@twiddle.net |
---|---|
State | New |
Headers | show |
On 4/16/10, Richard Henderson <rth@twiddle.net> wrote: > Rather than creating new temporaries for constants, use the > ones created in disas_sparc_insn. Remember the temps created > there so that they can be freed at the end of the function. > > Profile data collected by TCG while booting sparc-test kernel: > > -avg temps/TB 70.61 max=421 > +avg temps/TB 62.75 max=66 > > Signed-off-by: Richard Henderson <rth@twiddle.net> Thanks, applied whole series. About this patch: it's good that we now free the constants, but constant handling is still not optimal and I think this series actually may add extra 'movi' ops in the worst case. It would be nice if we detected if constants are in play and call immediate versions (addi, subi etc) automatically. This may need bigger refactoring, though. > --- > target-sparc/translate.c | 52 +++++++++++++++++++++++---------------------- > 1 files changed, 27 insertions(+), 25 deletions(-) > > diff --git a/target-sparc/translate.c b/target-sparc/translate.c > index 2c07385..2c833ab 100644 > --- a/target-sparc/translate.c > +++ b/target-sparc/translate.c > @@ -49,7 +49,7 @@ static TCGv cpu_y; > #ifndef CONFIG_USER_ONLY > static TCGv cpu_tbr; > #endif > -static TCGv cpu_cond, cpu_src1, cpu_src2, cpu_dst, cpu_addr, cpu_val; > +static TCGv cpu_cond, cpu_dst, cpu_addr, cpu_val; > #ifdef TARGET_SPARC64 > static TCGv_i32 cpu_xcc, cpu_asi, cpu_fprs; > static TCGv cpu_gsr; > @@ -1631,12 +1631,13 @@ static inline TCGv get_src1(unsigned int insn, TCGv def) > unsigned int rs1; > > rs1 = GET_FIELD(insn, 13, 17); > - if (rs1 == 0) > - r_rs1 = tcg_const_tl(0); // XXX how to free? 
> - else if (rs1 < 8) > + if (rs1 == 0) { > + tcg_gen_movi_tl(def, 0); > + } else if (rs1 < 8) { > r_rs1 = cpu_gregs[rs1]; > - else > + } else { > tcg_gen_ld_tl(def, cpu_regwptr, (rs1 - 8) * sizeof(target_ulong)); > + } > return r_rs1; > } > > @@ -1645,20 +1646,17 @@ static inline TCGv get_src2(unsigned int insn, TCGv def) > TCGv r_rs2 = def; > > if (IS_IMM) { /* immediate */ > - target_long simm; > - > - simm = GET_FIELDs(insn, 19, 31); > - r_rs2 = tcg_const_tl(simm); // XXX how to free? > + target_long simm = GET_FIELDs(insn, 19, 31); > + tcg_gen_movi_tl(def, simm); > } else { /* register */ > - unsigned int rs2; > - > - rs2 = GET_FIELD(insn, 27, 31); > - if (rs2 == 0) > - r_rs2 = tcg_const_tl(0); // XXX how to free? > - else if (rs2 < 8) > + unsigned int rs2 = GET_FIELD(insn, 27, 31); > + if (rs2 == 0) { > + tcg_gen_movi_tl(def, 0); > + } else if (rs2 < 8) { > r_rs2 = cpu_gregs[rs2]; > - else > + } else { > tcg_gen_ld_tl(def, cpu_regwptr, (rs2 - 8) * sizeof(target_ulong)); > + } > } > return r_rs2; > } > @@ -1701,6 +1699,7 @@ static inline void gen_load_trap_state_at_tl(TCGv_ptr r_tsptr, TCGv_ptr cpu_env) > static void disas_sparc_insn(DisasContext * dc) > { > unsigned int insn, opc, rs1, rs2, rd; > + TCGv cpu_src1, cpu_src2, cpu_tmp1, cpu_tmp2; > target_long simm; > > if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) > @@ -1710,8 +1709,8 @@ static void disas_sparc_insn(DisasContext * dc) > > rd = GET_FIELD(insn, 2, 6); > > - cpu_src1 = tcg_temp_new(); // const > - cpu_src2 = tcg_temp_new(); // const > + cpu_tmp1 = cpu_src1 = tcg_temp_new(); > + cpu_tmp2 = cpu_src2 = tcg_temp_new(); > > switch (opc) { > case 0: /* branches/sethi */ > @@ -4599,7 +4598,7 @@ static void disas_sparc_insn(DisasContext * dc) > dc->npc = dc->npc + 4; > } > jmp_insn: > - return; > + goto egress; > illegal_insn: > { > TCGv_i32 r_const; > @@ -4610,7 +4609,7 @@ static void disas_sparc_insn(DisasContext * dc) > tcg_temp_free_i32(r_const); > dc->is_br = 1; > } > - return; > + goto egress; > 
unimp_flush: > { > TCGv_i32 r_const; > @@ -4621,7 +4620,7 @@ static void disas_sparc_insn(DisasContext * dc) > tcg_temp_free_i32(r_const); > dc->is_br = 1; > } > - return; > + goto egress; > #if !defined(CONFIG_USER_ONLY) > priv_insn: > { > @@ -4633,19 +4632,19 @@ static void disas_sparc_insn(DisasContext * dc) > tcg_temp_free_i32(r_const); > dc->is_br = 1; > } > - return; > + goto egress; > #endif > nfpu_insn: > save_state(dc, cpu_cond); > gen_op_fpexception_im(FSR_FTT_UNIMPFPOP); > dc->is_br = 1; > - return; > + goto egress; > #if !defined(CONFIG_USER_ONLY) && !defined(TARGET_SPARC64) > nfq_insn: > save_state(dc, cpu_cond); > gen_op_fpexception_im(FSR_FTT_SEQ_ERROR); > dc->is_br = 1; > - return; > + goto egress; > #endif > #ifndef TARGET_SPARC64 > ncp_insn: > @@ -4658,8 +4657,11 @@ static void disas_sparc_insn(DisasContext * dc) > tcg_temp_free(r_const); > dc->is_br = 1; > } > - return; > + goto egress; > #endif > + egress: > + tcg_temp_free(cpu_tmp1); > + tcg_temp_free(cpu_tmp2); > } > > static inline void gen_intermediate_code_internal(TranslationBlock * tb, > > -- > 1.6.6.1 > >
On 04/17/2010 11:41 AM, Blue Swirl wrote: > About this patch: it's good that we now free the constants, but > constant handling is still not optimal and I think this series > actually may add extra 'movi' ops in the worst case. It would be nice > if we detected if constants are in play and call immediate versions > (addi, subi etc) automatically. This may need bigger refactoring, > though. No, that won't help, since the first thing that addi, subi, etc do is to load the constant into a temporary. What would *really* help though, is something along the lines of Aurelien's constant propagation patch, followed by some mechanism to refactor constants in the backend. Aurelien's patch does a good job of building the full constant that the RISC instruction stream needed to use to generate the full 32-bit or 64-bit constant. If the host is x86, that's just about all we need. However, if the host is a RISC, we'll generally need to decompose the constant again. I've got the outline of an idea by which TCG can remember which constants are actually loaded into registers. And it should be designed so that the host backend can call into it to load other constants. In this way when we have a pair of constants like 0xfff00011 0xfff00022 the sparc backend can (if things go well with register allocation) load the %hi(0xfff00000) just once, and form the full constants with addition from there. r~
On 04/17/2010 12:49 PM, Richard Henderson wrote: > On 04/17/2010 11:41 AM, Blue Swirl wrote: >> About this patch: it's good that we now free the constants, but >> constant handling is still not optimal and I think this series >> actually may add extra 'movi' ops in the worst case. It would be nice >> if we detected if constants are in play and call immediate versions >> (addi, subi etc) automatically. This may need bigger refactoring, >> though. > > No, that won't help, since the first thing that addi, subi, etc > do is to load the constant into a temporary. > > What would *really* help though, is something along the lines of > Aurelien's constant propagation patch, followed by some mechanism > to refactor constants in the backend. ... Actually, I forgot to mention that the biggest thing that would help the Sparc target would be to eliminate the explicit loads/stores of the windowed registers, such that the generic TCG propagation and dead code elimination passes can do their job properly. I've been meaning to try changing the windowing code on the sparc to memcpy the registers into and out of fixed slots in the CPUState and see what kind of effect that has on overall performance. I have a feeling that it will be an improvement, since it should avoid some of the myriad of redundant loads and stores in the generated code. r~
On 4/17/10, Richard Henderson <rth@twiddle.net> wrote: > On 04/17/2010 11:41 AM, Blue Swirl wrote: > > About this patch: it's good that we now free the constants, but > > constant handling is still not optimal and I think this series > > actually may add extra 'movi' ops in the worst case. It would be nice > > if we detected if constants are in play and call immediate versions > > (addi, subi etc) automatically. This may need bigger refactoring, > > though. > > > No, that won't help, since the first thing that addi, subi, etc > do is to load the constant into a temporary. Yes, but we would still gain the small optimizations for add by 0, and with 0xffffffff etc. in tcg-op.h. Sparc QEMU target generates a lot of those because of poor constant formation choices made by the guest compilers. > What would *really* help though, is something along the lines of > Aurelien's constant propagation patch, followed by some mechanism > to refactor constants in the backend. > > Aurelien's patch does a good job of building the full constant > that the RISC instruction stream needed to use to generate the > full 32-bit or 64-bit constant. If the host is x86, that's just > about all we need. However, if the host is a RISC, we'll > generally need to decompose the constant again. > > I've got the outline of an idea by which TCG can remember which > constants are actually loaded into registers. And it should be > designed so that the host backend can call into it to load other > constants. In this way when we have a pair of constants like > > 0xfff00011 > 0xfff00022 > > the sparc backend can (if things go well with register allocation) > load the %hi(0xfff00000) just once, and form the full constants > with addition from there. That should be interesting. By the way, do you think a constant pool approach (put constants at the end of TB) would be useful, especially for 64 bit constants?
On 4/17/10, Richard Henderson <rth@twiddle.net> wrote: > On 04/17/2010 12:49 PM, Richard Henderson wrote: > > On 04/17/2010 11:41 AM, Blue Swirl wrote: > >> About this patch: it's good that we now free the constants, but > >> constant handling is still not optimal and I think this series > >> actually may add extra 'movi' ops in the worst case. It would be nice > >> if we detected if constants are in play and call immediate versions > >> (addi, subi etc) automatically. This may need bigger refactoring, > >> though. > > > > No, that won't help, since the first thing that addi, subi, etc > > do is to load the constant into a temporary. > > > > What would *really* help though, is something along the lines of > > Aurelien's constant propagation patch, followed by some mechanism > > to refactor constants in the backend. > > > ... Actually, I forgot to mention that the biggest thing that would > help the Sparc target would be to eliminate the explicit loads/stores > of the windowed registers, such that the generic TCG propagation and > dead code elimination passes can do their job properly. I had postponed that until AREG1/2 are freed, now we could take one host register for regwptr. > I've been meaning to try changing the windowing code on the sparc to > memcpy the registers into and out of fixed slots in the CPUState and > see what kind of effect that has on overall performance. I have a > feeling that it will be an improvement, since it should avoid some > of the myriad of redundant loads and stores in the generated code. Maybe. Performance figures for that would be very interesting. Alternative approach could be that if we could rely on all hosts (especially x86) having plenty of registers, we could even use different regwptrs for %o, %l and %i sets. Then there are host page mapping tricks (map the same page at two locations to simulate wrapping) but that may be too tricky.
On 04/17/2010 01:41 PM, Blue Swirl wrote: > Yes, but we would still gain the small optimizations for add by 0, and > with 0xffffffff etc. in tcg-op.h. Sparc QEMU target generates a lot of > those because of poor constant formation choices made by the guest > compilers. Another thing that gets fixed by Aurelien's constant prop patch. Don't think that Sparc is alone in generating x+0 in a way that gets past the tcg-op.h checks. > By the way, do you think constant pool approach (put constants at the > end of TB) would be useful, especially for 64 bit constants? Probably. The support for that could probably be leveraged to move the TLB miss code path out of line as well. r~
diff --git a/target-sparc/translate.c b/target-sparc/translate.c index 2c07385..2c833ab 100644 --- a/target-sparc/translate.c +++ b/target-sparc/translate.c @@ -49,7 +49,7 @@ static TCGv cpu_y; #ifndef CONFIG_USER_ONLY static TCGv cpu_tbr; #endif -static TCGv cpu_cond, cpu_src1, cpu_src2, cpu_dst, cpu_addr, cpu_val; +static TCGv cpu_cond, cpu_dst, cpu_addr, cpu_val; #ifdef TARGET_SPARC64 static TCGv_i32 cpu_xcc, cpu_asi, cpu_fprs; static TCGv cpu_gsr; @@ -1631,12 +1631,13 @@ static inline TCGv get_src1(unsigned int insn, TCGv def) unsigned int rs1; rs1 = GET_FIELD(insn, 13, 17); - if (rs1 == 0) - r_rs1 = tcg_const_tl(0); // XXX how to free? - else if (rs1 < 8) + if (rs1 == 0) { + tcg_gen_movi_tl(def, 0); + } else if (rs1 < 8) { r_rs1 = cpu_gregs[rs1]; - else + } else { tcg_gen_ld_tl(def, cpu_regwptr, (rs1 - 8) * sizeof(target_ulong)); + } return r_rs1; } @@ -1645,20 +1646,17 @@ static inline TCGv get_src2(unsigned int insn, TCGv def) TCGv r_rs2 = def; if (IS_IMM) { /* immediate */ - target_long simm; - - simm = GET_FIELDs(insn, 19, 31); - r_rs2 = tcg_const_tl(simm); // XXX how to free? + target_long simm = GET_FIELDs(insn, 19, 31); + tcg_gen_movi_tl(def, simm); } else { /* register */ - unsigned int rs2; - - rs2 = GET_FIELD(insn, 27, 31); - if (rs2 == 0) - r_rs2 = tcg_const_tl(0); // XXX how to free? 
- else if (rs2 < 8) + unsigned int rs2 = GET_FIELD(insn, 27, 31); + if (rs2 == 0) { + tcg_gen_movi_tl(def, 0); + } else if (rs2 < 8) { r_rs2 = cpu_gregs[rs2]; - else + } else { tcg_gen_ld_tl(def, cpu_regwptr, (rs2 - 8) * sizeof(target_ulong)); + } } return r_rs2; } @@ -1701,6 +1699,7 @@ static inline void gen_load_trap_state_at_tl(TCGv_ptr r_tsptr, TCGv_ptr cpu_env) static void disas_sparc_insn(DisasContext * dc) { unsigned int insn, opc, rs1, rs2, rd; + TCGv cpu_src1, cpu_src2, cpu_tmp1, cpu_tmp2; target_long simm; if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) @@ -1710,8 +1709,8 @@ static void disas_sparc_insn(DisasContext * dc) rd = GET_FIELD(insn, 2, 6); - cpu_src1 = tcg_temp_new(); // const - cpu_src2 = tcg_temp_new(); // const + cpu_tmp1 = cpu_src1 = tcg_temp_new(); + cpu_tmp2 = cpu_src2 = tcg_temp_new(); switch (opc) { case 0: /* branches/sethi */ @@ -4599,7 +4598,7 @@ static void disas_sparc_insn(DisasContext * dc) dc->npc = dc->npc + 4; } jmp_insn: - return; + goto egress; illegal_insn: { TCGv_i32 r_const; @@ -4610,7 +4609,7 @@ static void disas_sparc_insn(DisasContext * dc) tcg_temp_free_i32(r_const); dc->is_br = 1; } - return; + goto egress; unimp_flush: { TCGv_i32 r_const; @@ -4621,7 +4620,7 @@ static void disas_sparc_insn(DisasContext * dc) tcg_temp_free_i32(r_const); dc->is_br = 1; } - return; + goto egress; #if !defined(CONFIG_USER_ONLY) priv_insn: { @@ -4633,19 +4632,19 @@ static void disas_sparc_insn(DisasContext * dc) tcg_temp_free_i32(r_const); dc->is_br = 1; } - return; + goto egress; #endif nfpu_insn: save_state(dc, cpu_cond); gen_op_fpexception_im(FSR_FTT_UNIMPFPOP); dc->is_br = 1; - return; + goto egress; #if !defined(CONFIG_USER_ONLY) && !defined(TARGET_SPARC64) nfq_insn: save_state(dc, cpu_cond); gen_op_fpexception_im(FSR_FTT_SEQ_ERROR); dc->is_br = 1; - return; + goto egress; #endif #ifndef TARGET_SPARC64 ncp_insn: @@ -4658,8 +4657,11 @@ static void disas_sparc_insn(DisasContext * dc) tcg_temp_free(r_const); dc->is_br = 1; } - return; + 
goto egress; #endif + egress: + tcg_temp_free(cpu_tmp1); + tcg_temp_free(cpu_tmp2); } static inline void gen_intermediate_code_internal(TranslationBlock * tb,
Rather than creating new temporaries for constants, use the ones created in disas_sparc_insn. Remember the temps created there so that they can be freed at the end of the function. Profile data collected by TCG while booting sparc-test kernel: -avg temps/TB 70.61 max=421 +avg temps/TB 62.75 max=66 Signed-off-by: Richard Henderson <rth@twiddle.net> --- target-sparc/translate.c | 52 +++++++++++++++++++++++---------------------- 1 files changed, 27 insertions(+), 25 deletions(-)