Message ID | 1347917713-23343-6-git-send-email-rth@twiddle.net |
---|---|
State | New |
Headers | show |
On Mon, Sep 17, 2012 at 02:35:11PM -0700, Richard Henderson wrote: > With normal FP, this doesn't have much affect on the generated code, > because most of the FP operations are not CONST/PURE, and so we spill > registers in about the same frequency as the explicit load/stores. > > But with Loongson multimedia instructions, which are all integral and > whose helpers are in fact CONST+PURE, this greatly improves the code. > > Rather than over-use the deposit operation, we create TCG registers for > both the 64-bit FPU register as a whole and the two 32-bit halves. We > only ever reference the whole register or the two half registers in any > one TB, so there's no problem with aliasing. > > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > target-mips/translate.c | 141 +++++++++++++++++++++++++++++++++--------------- > 1 file changed, 97 insertions(+), 44 deletions(-) > > diff --git a/target-mips/translate.c b/target-mips/translate.c > index b4301e9..df92cec 100644 > --- a/target-mips/translate.c > +++ b/target-mips/translate.c > @@ -479,6 +479,12 @@ static TCGv cpu_dspctrl, btarget, bcond; > static TCGv_i32 hflags; > static TCGv_i32 fpu_fcr0, fpu_fcr31; > > +/* FPU registers. These alias, but we'll only use one or the other > + in any one TB based on MIPS_HFLAG_F64. 
*/ > +static TCGv_i32 fpu_f32[32]; > +static TCGv_i32 fpu_fh32[32]; > +static TCGv_i64 fpu_f64[32]; > + > static uint32_t gen_opc_hflags[OPC_BUF_SIZE]; > > #include "gen-icount.h" > @@ -545,26 +551,45 @@ enum { > BS_EXCP = 3, /* We reached an exception condition */ > }; > > -static const char *regnames[] = > - { "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3", > - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", > - "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", > - "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra", }; > +static const char * const regnames[] = { > + "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3", > + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", > + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", > + "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra", > +}; > + > +static const char * const regnames_HI[] = { > + "HI0", "HI1", "HI2", "HI3", > +}; > > -static const char *regnames_HI[] = > - { "HI0", "HI1", "HI2", "HI3", }; > +static const char * const regnames_LO[] = { > + "LO0", "LO1", "LO2", "LO3", > +}; > > -static const char *regnames_LO[] = > - { "LO0", "LO1", "LO2", "LO3", }; > +static const char * const regnames_ACX[] = { > + "ACX0", "ACX1", "ACX2", "ACX3", > +}; > > -static const char *regnames_ACX[] = > - { "ACX0", "ACX1", "ACX2", "ACX3", }; > +static const char * const fregnames[] = { > + "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", > + "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", > + "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", > + "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", > +}; > > -static const char *fregnames[] = > - { "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", > - "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", > - "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", > - "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", }; > +static const char * const flregnames[] = { > + "fl0", "fl1", "fl2", "fl3", "fl4", "fl5", "fl6", "fl7", > + "fl8", "fl9", "fl10", "fl11", "fl12", "fl13", "fl14", 
"fl15", > + "fl16", "fl17", "fl18", "fl19", "fl20", "fl21", "fl22", "fl23", > + "fl24", "fl25", "fl26", "fl27", "fl28", "fl29", "fl30", "fl31", > +}; > + > +static const char * const fhregnames[] = { > + "fh0", "fh1", "fh2", "fh3", "fh4", "fh5", "fh6", "fh7", > + "fh8", "fh9", "fh10", "fh11", "fh12", "fh13", "fh14", "fh15", > + "fh16", "fh17", "fh18", "fh19", "fh20", "fh21", "fh22", "fh23", > + "fh24", "fh25", "fh26", "fh27", "fh28", "fh29", "fh30", "fh31", > +}; > > #ifdef MIPS_DEBUG_DISAS > #define MIPS_DEBUG(fmt, ...) \ > @@ -662,55 +687,70 @@ static inline void gen_store_srsgpr (int from, int to) > } > > /* Floating point register moves. */ > -static inline void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg) > +static void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg) > { > - tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX])); > + if (ctx->hflags & MIPS_HFLAG_F64) { > + tcg_gen_trunc_i64_i32(t, fpu_f64[reg]); > + } else { > + tcg_gen_mov_i32(t, fpu_f32[reg]); > + } > } > > -static inline void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg) > +static void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg) > { > - tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX])); > + if (ctx->hflags & MIPS_HFLAG_F64) { > + TCGv_i64 t64 = tcg_temp_new_i64(); > + tcg_gen_ext_i32_i64(t64, t); > + tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 0, 32); > + tcg_temp_free_i64(t64); > + } else { > + tcg_gen_mov_i32(fpu_f32[reg], t); > + } > } > > -static inline void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg) > +static void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg) > { > - tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX])); > + if (ctx->hflags & MIPS_HFLAG_F64) { > + TCGv_i64 t64 = tcg_temp_new_i64(); > + tcg_gen_shri_i64(t64, fpu_f64[reg], 32); > + tcg_gen_trunc_i64_i32(t, t64); > + tcg_temp_free_i64(t64); > + 
} else { > + tcg_gen_mov_i32(t, fpu_fh32[reg]); > + } > } > > -static inline void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg) > +static void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg) > { > - tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX])); > + if (ctx->hflags & MIPS_HFLAG_F64) { > + TCGv_i64 t64 = tcg_temp_new_i64(); > + tcg_gen_ext_i32_i64(t64, t); > + tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 32, 32); > + tcg_temp_free_i64(t64); > + } else { > + tcg_gen_mov_i32(fpu_fh32[reg], t); > + } > } > > -static inline void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg) > +static void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg) > { > if (ctx->hflags & MIPS_HFLAG_F64) { > - tcg_gen_ld_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d)); > + tcg_gen_mov_i64(t, fpu_f64[reg]); > } else { > - TCGv_i32 t0 = tcg_temp_new_i32(); > - TCGv_i32 t1 = tcg_temp_new_i32(); > - gen_load_fpr32(ctx, t0, reg & ~1); > - gen_load_fpr32(ctx, t1, reg | 1); > - tcg_gen_concat_i32_i64(t, t0, t1); > - tcg_temp_free_i32(t0); > - tcg_temp_free_i32(t1); > + tcg_gen_concat_i32_i64(t, fpu_f32[reg & ~1], fpu_f32[reg | 1]); > } > } > > -static inline void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg) > +static void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg) > { > if (ctx->hflags & MIPS_HFLAG_F64) { > - tcg_gen_st_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d)); > + tcg_gen_mov_i64(fpu_f64[reg], t); > } else { > - TCGv_i64 t0 = tcg_temp_new_i64(); > - TCGv_i32 t1 = tcg_temp_new_i32(); > - tcg_gen_trunc_i64_i32(t1, t); > - gen_store_fpr32(ctx, t1, reg & ~1); > - tcg_gen_shri_i64(t0, t, 32); > - tcg_gen_trunc_i64_i32(t1, t0); > - gen_store_fpr32(ctx, t1, reg | 1); > - tcg_temp_free_i32(t1); > - tcg_temp_free_i64(t0); > + TCGv_i64 t64 = tcg_temp_new_i64(); > + tcg_gen_shri_i64(t64, t, 32); > + tcg_gen_trunc_i64_i32(fpu_f32[reg | 1], t64); > + 
tcg_temp_free_i64(t64); > + tcg_gen_trunc_i64_i32(fpu_f32[reg & ~1], t); > } > } > > @@ -12694,6 +12734,19 @@ static void mips_tcg_init(void) > offsetof(CPUMIPSState, active_fpu.fcr31), > "fcr31"); > > + for (i = 0; i < 32; i++) { > + int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]); > + fpu_f32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, flregnames[i]); > + } > + for (i = 0; i < 32; i++) { > + int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[!FP_ENDIAN_IDX]); > + fpu_fh32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, fhregnames[i]); > + } > + for (i = 0; i < 32; i++) { > + int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]); This should be fpr[i].d. > + fpu_f64[i] = tcg_global_mem_new_i64(TCG_AREG0, off, fregnames[i]); > + } > + Adding so many globals (i.e. multiplying by 4) has a cost that is greater than the gains. Remember that the register allocator loops over all globals at the end of a basic block or when calling a non-CONST helper/op. While the generated code looks nicer, this slows down the guest by roughly 12% (measured on boot time). I am currently working on an optimization of the liveness/register allocator which, among other things, partly mitigates that (I hope to get the patches ready for posting in a week or so). That said, the slowdown is still around 3%. I think we should go for only mapping the fp registers as 64-bit registers, and use trunc/shift/deposit to read/write them. Of course the generated code doesn't look so nice, but what is important is that the overall execution is faster, not slower. > /* register helpers */ > #define GEN_HELPER 2 > #include "helper.h" > -- > 1.7.11.4 >
diff --git a/target-mips/translate.c b/target-mips/translate.c index b4301e9..df92cec 100644 --- a/target-mips/translate.c +++ b/target-mips/translate.c @@ -479,6 +479,12 @@ static TCGv cpu_dspctrl, btarget, bcond; static TCGv_i32 hflags; static TCGv_i32 fpu_fcr0, fpu_fcr31; +/* FPU registers. These alias, but we'll only use one or the other + in any one TB based on MIPS_HFLAG_F64. */ +static TCGv_i32 fpu_f32[32]; +static TCGv_i32 fpu_fh32[32]; +static TCGv_i64 fpu_f64[32]; + static uint32_t gen_opc_hflags[OPC_BUF_SIZE]; #include "gen-icount.h" @@ -545,26 +551,45 @@ enum { BS_EXCP = 3, /* We reached an exception condition */ }; -static const char *regnames[] = - { "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3", - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", - "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", - "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra", }; +static const char * const regnames[] = { + "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3", + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", + "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra", +}; + +static const char * const regnames_HI[] = { + "HI0", "HI1", "HI2", "HI3", +}; -static const char *regnames_HI[] = - { "HI0", "HI1", "HI2", "HI3", }; +static const char * const regnames_LO[] = { + "LO0", "LO1", "LO2", "LO3", +}; -static const char *regnames_LO[] = - { "LO0", "LO1", "LO2", "LO3", }; +static const char * const regnames_ACX[] = { + "ACX0", "ACX1", "ACX2", "ACX3", +}; -static const char *regnames_ACX[] = - { "ACX0", "ACX1", "ACX2", "ACX3", }; +static const char * const fregnames[] = { + "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", + "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", + "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", + "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", +}; -static const char *fregnames[] = - { "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", - "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", - 
"f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", - "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", }; +static const char * const flregnames[] = { + "fl0", "fl1", "fl2", "fl3", "fl4", "fl5", "fl6", "fl7", + "fl8", "fl9", "fl10", "fl11", "fl12", "fl13", "fl14", "fl15", + "fl16", "fl17", "fl18", "fl19", "fl20", "fl21", "fl22", "fl23", + "fl24", "fl25", "fl26", "fl27", "fl28", "fl29", "fl30", "fl31", +}; + +static const char * const fhregnames[] = { + "fh0", "fh1", "fh2", "fh3", "fh4", "fh5", "fh6", "fh7", + "fh8", "fh9", "fh10", "fh11", "fh12", "fh13", "fh14", "fh15", + "fh16", "fh17", "fh18", "fh19", "fh20", "fh21", "fh22", "fh23", + "fh24", "fh25", "fh26", "fh27", "fh28", "fh29", "fh30", "fh31", +}; #ifdef MIPS_DEBUG_DISAS #define MIPS_DEBUG(fmt, ...) \ @@ -662,55 +687,70 @@ static inline void gen_store_srsgpr (int from, int to) } /* Floating point register moves. */ -static inline void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg) +static void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg) { - tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX])); + if (ctx->hflags & MIPS_HFLAG_F64) { + tcg_gen_trunc_i64_i32(t, fpu_f64[reg]); + } else { + tcg_gen_mov_i32(t, fpu_f32[reg]); + } } -static inline void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg) +static void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg) { - tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX])); + if (ctx->hflags & MIPS_HFLAG_F64) { + TCGv_i64 t64 = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(t64, t); + tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 0, 32); + tcg_temp_free_i64(t64); + } else { + tcg_gen_mov_i32(fpu_f32[reg], t); + } } -static inline void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg) +static void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg) { - tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX])); + if 
(ctx->hflags & MIPS_HFLAG_F64) { + TCGv_i64 t64 = tcg_temp_new_i64(); + tcg_gen_shri_i64(t64, fpu_f64[reg], 32); + tcg_gen_trunc_i64_i32(t, t64); + tcg_temp_free_i64(t64); + } else { + tcg_gen_mov_i32(t, fpu_fh32[reg]); + } } -static inline void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg) +static void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg) { - tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX])); + if (ctx->hflags & MIPS_HFLAG_F64) { + TCGv_i64 t64 = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(t64, t); + tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 32, 32); + tcg_temp_free_i64(t64); + } else { + tcg_gen_mov_i32(fpu_fh32[reg], t); + } } -static inline void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg) +static void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg) { if (ctx->hflags & MIPS_HFLAG_F64) { - tcg_gen_ld_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d)); + tcg_gen_mov_i64(t, fpu_f64[reg]); } else { - TCGv_i32 t0 = tcg_temp_new_i32(); - TCGv_i32 t1 = tcg_temp_new_i32(); - gen_load_fpr32(ctx, t0, reg & ~1); - gen_load_fpr32(ctx, t1, reg | 1); - tcg_gen_concat_i32_i64(t, t0, t1); - tcg_temp_free_i32(t0); - tcg_temp_free_i32(t1); + tcg_gen_concat_i32_i64(t, fpu_f32[reg & ~1], fpu_f32[reg | 1]); } } -static inline void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg) +static void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg) { if (ctx->hflags & MIPS_HFLAG_F64) { - tcg_gen_st_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d)); + tcg_gen_mov_i64(fpu_f64[reg], t); } else { - TCGv_i64 t0 = tcg_temp_new_i64(); - TCGv_i32 t1 = tcg_temp_new_i32(); - tcg_gen_trunc_i64_i32(t1, t); - gen_store_fpr32(ctx, t1, reg & ~1); - tcg_gen_shri_i64(t0, t, 32); - tcg_gen_trunc_i64_i32(t1, t0); - gen_store_fpr32(ctx, t1, reg | 1); - tcg_temp_free_i32(t1); - tcg_temp_free_i64(t0); + TCGv_i64 t64 = tcg_temp_new_i64(); + tcg_gen_shri_i64(t64, t, 32); + 
tcg_gen_trunc_i64_i32(fpu_f32[reg | 1], t64); + tcg_temp_free_i64(t64); + tcg_gen_trunc_i64_i32(fpu_f32[reg & ~1], t); } } @@ -12694,6 +12734,19 @@ static void mips_tcg_init(void) offsetof(CPUMIPSState, active_fpu.fcr31), "fcr31"); + for (i = 0; i < 32; i++) { + int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]); + fpu_f32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, flregnames[i]); + } + for (i = 0; i < 32; i++) { + int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[!FP_ENDIAN_IDX]); + fpu_fh32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, fhregnames[i]); + } + for (i = 0; i < 32; i++) { + int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]); + fpu_f64[i] = tcg_global_mem_new_i64(TCG_AREG0, off, fregnames[i]); + } + /* register helpers */ #define GEN_HELPER 2 #include "helper.h"
With normal FP, this doesn't have much effect on the generated code, because most of the FP operations are not CONST/PURE, and so we spill registers at about the same frequency as the explicit load/stores. But with Loongson multimedia instructions, which are all integral and whose helpers are in fact CONST+PURE, this greatly improves the code. Rather than over-use the deposit operation, we create TCG registers for both the 64-bit FPU register as a whole and the two 32-bit halves. We only ever reference the whole register or the two half registers in any one TB, so there's no problem with aliasing. Signed-off-by: Richard Henderson <rth@twiddle.net> --- target-mips/translate.c | 141 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 97 insertions(+), 44 deletions(-)